import copy import math import warnings from typing import List, Optional, Tuple, Union from transformers import MT5PreTrainedModel from transformers.models.mt5 import MT5Stack from transformers.modeling_outputs import Seq2SeqModelOutput,Seq2SeqLMOutput, BaseModelOutput from transformers.utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ) from transformers.utils.model_parallel_utils import assert_device_map, get_device_map import torch from torch import nn from torch.nn import CrossEntropyLoss from .config import MT5Config from .docstrings import ( PARALLELIZE_DOCSTRING, DEPARALLELIZE_DOCSTRING, __HEAD_MASK_WARNING_MSG, MT5_START_DOCSTRING, MT5_INPUTS_DOCSTRING, ) logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "MT5Config" _CHECKPOINT_FOR_DOC = "mt5-small" class MT5Model(MT5PreTrainedModel): r""" Examples: ```python >>> from transformers import MT5Model, AutoTokenizer >>> model = MT5Model.from_pretrained("google/mt5-small") >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small") >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, return_tensors="pt") >>> labels = tokenizer(text_target=summary, return_tensors="pt") >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"]) >>> hidden_states = outputs.last_hidden_state ```""" model_type = "mt5" config_class = MT5Config _keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5 def __init__(self, config: MT5Config): super().__init__(config) self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model) if config.shared_embedding: self.decoder_embedding = self.encoder_embedding else: self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model) encoder_config = copy.deepcopy(config) encoder_config.is_decoder = False encoder_config.use_cache = False encoder_config.is_encoder_decoder = False self.encoder = MT5Stack(encoder_config, self.encoder_embedding) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True decoder_config.is_encoder_decoder = False decoder_config.num_layers = config.num_decoder_layers self.decoder = MT5Stack(decoder_config, self.decoder_emebedding) # Initialize weights and apply final processing self.post_init() # Model parallel self.model_parallel = False self.device_map = None # Copied from transformers.models.t5.modeling_t5.T5Model.parallelize def parallelize(self, device_map=None): warnings.warn( "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model" " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own" " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':" " 0, 'encoder.block.1': 1, ...}", FutureWarning, ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None else device_map ) assert_device_map(self.device_map, len(self.encoder.block)) self.encoder.parallelize(self.device_map) self.decoder.parallelize(self.device_map) self.model_parallel = True @add_start_docstrings(DEPARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5Model.deparallelize def deparallelize(self): warnings.warn( "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", FutureWarning, ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") self.decoder = self.decoder.to("cpu") self.model_parallel = False self.device_map = None torch.cuda.empty_cache() # Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings def get_input_embeddings(self): return self.encoder_embedding # Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings def set_input_embeddings(self, new_embeddings): self.encoder_embedding = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) # Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder def get_encoder(self): return self.encoder # Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder def get_decoder(self): return self.decoder # Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) # Copied from transformers.models.t5.modeling_t5.T5Model.forward with T5->MT5, t5->mt5 def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.FloatTensor] = None, decoder_head_mask: Optional[torch.FloatTensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, inputs_embeds: Optional[torch.Tensor] = None, decoder_inputs_embeds: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: r""" Returns: Example: ```python >>> from transformers import AutoTokenizer, MT5Model >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small") >>> model = MT5Model.from_pretrained("mt5-small") >>> input_ids = tokenizer( ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" ... ).input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model. >>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg. >>> decoder_input_ids = model._shift_right(decoder_input_ids) >>> # forward pass >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask if head_mask is not None and decoder_head_mask is None: if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ) hidden_states = encoder_outputs[0] # Set device for model parallelism if self.model_parallel: torch.cuda.set_device(self.decoder.first_device) hidden_states = hidden_states.to(self.decoder.first_device) if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) if attention_mask is not None: attention_mask = attention_mask.to(self.decoder.first_device) if decoder_attention_mask is not None: decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_values=past_key_values, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, cross_attn_head_mask=cross_attn_head_mask, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) if not return_dict: return decoder_outputs + encoder_outputs return Seq2SeqModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=encoder_outputs.last_hidden_state, encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) @add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING) class MT5ForConditionalGeneration(MT5PreTrainedModel): r""" Examples: ```python >>> from transformers import MT5ForConditionalGeneration, AutoTokenizer >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small") >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." >>> summary = "Weiter Verhandlung in Syrien." >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt") >>> outputs = model(**inputs) >>> loss = outputs.loss ```""" model_type = "mt5" config_class = MT5Config _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 def __init__(self, config: MT5Config): super().__init__(config) self.model_dim = config.d_model self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model) if config.shared_embedding: self.decoder_embedding = self.encoder_embedding else: self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model) encoder_config = copy.deepcopy(config) encoder_config.is_decoder = False encoder_config.use_cache = False encoder_config.is_encoder_decoder = False self.encoder = MT5Stack(encoder_config, self.encoder_embedding) decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True decoder_config.is_encoder_decoder = False decoder_config.num_layers = config.num_decoder_layers self.decoder = MT5Stack(decoder_config, self.decoder_emebedding) self.lm_head = nn.Linear(config.d_model, config.decoder_vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() # Model parallel self.model_parallel = False self.device_map = None @add_start_docstrings(PARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize def parallelize(self, device_map=None): warnings.warn( "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you" " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also" " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance" " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}", FutureWarning, ) self.device_map = ( get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) if device_map is None else device_map ) assert_device_map(self.device_map, len(self.encoder.block)) self.encoder.parallelize(self.device_map) self.decoder.parallelize(self.device_map) self.lm_head = self.lm_head.to(self.decoder.first_device) self.model_parallel = True @add_start_docstrings(DEPARALLELIZE_DOCSTRING) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize def deparallelize(self): warnings.warn( "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.", FutureWarning, ) self.encoder.deparallelize() self.decoder.deparallelize() self.encoder = self.encoder.to("cpu") self.decoder = self.decoder.to("cpu") self.lm_head = self.lm_head.to("cpu") self.model_parallel = False self.device_map = None torch.cuda.empty_cache() # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings def get_input_embeddings(self): return self.encoder_embedding # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings def set_input_embeddings(self, new_embeddings): self.encoder_embedding = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings def get_output_embeddings(self): return self.lm_head # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder def get_encoder(self): return self.encoder # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5 def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.BoolTensor] = None, head_mask: Optional[torch.FloatTensor] = None, decoder_head_mask: Optional[torch.FloatTensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` Returns: Examples: ```python >>> from transformers import AutoTokenizer, MT5ForConditionalGeneration >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small") >>> model = MT5ForConditionalGeneration.from_pretrained("mt5-small") >>> # training >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids >>> outputs = model(input_ids=input_ids, labels=labels) >>> loss = outputs.loss >>> logits = outputs.logits >>> # inference >>> input_ids = tokenizer( ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" ... ).input_ids # Batch size 1 >>> outputs = model.generate(input_ids) >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) >>> # studies have shown that owning a dog is good for you. ```""" use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask if head_mask is not None and decoder_head_mask is None: if self.config.num_layers == self.config.num_decoder_layers: warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) decoder_head_mask = head_mask # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ) hidden_states = encoder_outputs[0] if self.model_parallel: torch.cuda.set_device(self.decoder.first_device) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self._shift_right(labels) # Set device for model parallelism if self.model_parallel: torch.cuda.set_device(self.decoder.first_device) hidden_states = hidden_states.to(self.decoder.first_device) if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) if attention_mask is not None: attention_mask = attention_mask.to(self.decoder.first_device) if decoder_attention_mask is not None: decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, inputs_embeds=decoder_inputs_embeds, past_key_values=past_key_values, encoder_hidden_states=hidden_states, encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, cross_attn_head_mask=cross_attn_head_mask, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = decoder_outputs[0] # Set device for model parallelism if self.model_parallel: torch.cuda.set_device(self.encoder.first_device) self.lm_head = self.lm_head.to(self.encoder.first_device) sequence_output = sequence_output.to(self.lm_head.weight.device) if self.config.tie_word_embeddings: # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 sequence_output = sequence_output * (self.model_dim**-0.5) lm_logits = self.lm_head(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-100) # move labels to correct device to enable PP labels = labels.to(lm_logits.device) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 if not return_dict: output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs return ((loss,) + output) if loss is not None else output return Seq2SeqLMOutput( loss=loss, logits=lm_logits, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=encoder_outputs.last_hidden_state, encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, head_mask=None, decoder_head_mask=None, decoder_attention_mask=None, cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, **kwargs, ): # cut decoder_input_ids if past_key_values is used if past_key_values is not None: past_length = past_key_values[0][0].shape[2] # Some generation methods already pass only the last input ID if input_ids.shape[1] > past_length: remove_prefix_length = past_length else: # Default to old behavior: keep only final ID remove_prefix_length = input_ids.shape[1] - 1 input_ids = input_ids[:, remove_prefix_length:] return { "decoder_input_ids": input_ids, "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, "head_mask": head_mask, "decoder_head_mask": decoder_head_mask, "decoder_attention_mask": decoder_attention_mask, "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, } # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): # if decoder past is not included in output # speedy decoding is disabled and no need to reorder if past_key_values is None: logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") return past_key_values reordered_decoder_past = () for layer_past_states in past_key_values: # get the correct batch idx from layer past batch dim # batch dim of `past` is at 2nd position reordered_layer_past_states = () for layer_past_state in layer_past_states: # need to set correct `past` for each of the four key / value states reordered_layer_past_states = reordered_layer_past_states + ( layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), ) if reordered_layer_past_states[0].shape != layer_past_states[0].shape: raise ValueError( f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched" ) if len(reordered_layer_past_states) != len(layer_past_states): raise ValueError( f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched" ) reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) return reordered_decoder_past