flan-t5-large-squad2 / modeling_t5qa.py

Update modeling_t5qa.py

ae45ee3 over 1 year ago

9.09 kB

	import copy
	import warnings
	from typing import Optional, Tuple, Union

	import torch
	from torch import nn
	from torch.nn import CrossEntropyLoss

	from transformers import AutoModelForQuestionAnswering
	from transformers.modeling_outputs import (
	BaseModelOutput,
	Seq2SeqQuestionAnsweringModelOutput,
	)
	from transformers.models.t5.configuration_t5 import T5Config
	from transformers.models.t5.modeling_t5 import T5PreTrainedModel, T5Stack


	class T5ForQuestionAnswering(T5PreTrainedModel):
	    """T5 encoder-decoder with a linear span head for extractive question answering.

	    The decoder's output hidden states are projected by ``qa_outputs`` to
	    ``config.num_labels`` logits per decoder position. ``forward`` splits that
	    projection into exactly two tensors (start and end logits), so
	    ``config.num_labels`` has to be 2.
	    """

	    _keys_to_ignore_on_load_missing = [
	        r"encoder.embed_tokens.weight",
	        r"decoder.embed_tokens.weight",
	    ]
	    _keys_to_ignore_on_load_unexpected = [
	        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
	    ]

	    # Warning text used when `head_mask` is reused as `decoder_head_mask`.
	    # Deliberately a single-underscore class attribute: a `__NAME` identifier
	    # referenced inside a class body is name-mangled and would not resolve.
	    _HEAD_MASK_WARNING_MSG = (
	        "The input argument `head_mask` was split into two arguments `head_mask` and "
	        "`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, "
	        "but this feature is deprecated and will be removed in future versions. If you do "
	        "not want to use any `decoder_head_mask` now, please set "
	        "`decoder_head_mask = torch.ones(num_layers, num_heads)`."
	    )

	    def __init__(self, config: T5Config):
	        """Build the shared embedding, the encoder/decoder stacks and the QA head.

	        Args:
	            config: Model configuration. It is deep-copied once for the encoder
	                and once for the decoder, so the per-stack flags set below do
	                not leak into ``config`` itself.
	        """
	        super().__init__(config)
	        self.model_dim = config.d_model

	        # One embedding matrix shared by both stacks.
	        self.shared = nn.Embedding(config.vocab_size, config.d_model)

	        encoder_config = copy.deepcopy(config)
	        encoder_config.is_decoder = False
	        encoder_config.use_cache = False
	        encoder_config.is_encoder_decoder = False
	        self.encoder = T5Stack(encoder_config, self.shared)

	        decoder_config = copy.deepcopy(config)
	        decoder_config.is_decoder = True
	        decoder_config.is_encoder_decoder = False
	        decoder_config.num_layers = config.num_decoder_layers
	        self.decoder = T5Stack(decoder_config, self.shared)

	        self.num_labels = config.num_labels
	        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

	        # Initialize weights and apply final processing
	        self.post_init()

	        # Model parallel
	        self.model_parallel = False
	        self.device_map = None

	    def get_input_embeddings(self):
	        """Return the embedding module shared by the encoder and the decoder."""
	        return self.shared

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the shared embedding and propagate it to both stacks."""
	        self.shared = new_embeddings
	        self.encoder.set_input_embeddings(new_embeddings)
	        self.decoder.set_input_embeddings(new_embeddings)

	    def get_encoder(self):
	        """Return the encoder stack."""
	        return self.encoder

	    def get_decoder(self):
	        """Return the decoder stack."""
	        return self.decoder

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        decoder_input_ids: Optional[torch.LongTensor] = None,
	        decoder_attention_mask: Optional[torch.BoolTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        decoder_head_mask: Optional[torch.FloatTensor] = None,
	        cross_attn_head_mask: Optional[torch.Tensor] = None,
	        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	        start_positions: Optional[torch.LongTensor] = None,
	        end_positions: Optional[torch.LongTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
	        r"""
	        Run the encoder (unless `encoder_outputs` is given), then the decoder, and project the decoder output to
	        start/end span logits.

	        If neither `decoder_input_ids` nor `decoder_inputs_embeds` is passed, the decoder input is derived from
	        `input_ids` via `self._shift_right`.

	        start_positions (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for position (index) of the start of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence
	            are not taken into account for computing the loss.
	        end_positions (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for position (index) of the end of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence
	            are not taken into account for computing the loss.

	        Returns:
	            A `Seq2SeqQuestionAnsweringModelOutput` when `return_dict` resolves to true; otherwise a tuple
	            `(start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs`, prefixed with the loss when both
	            `start_positions` and `end_positions` are given. The loss is the mean of the start and end
	            cross-entropy losses.

	        Raises:
	            ValueError: If `input_ids`, `decoder_input_ids` and `decoder_inputs_embeds` are all `None`.
	        """
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
	        use_cache = use_cache if use_cache is not None else self.config.use_cache
	        # Caching is switched off whenever labels are supplied (loss computation).
	        if start_positions is not None and end_positions is not None:
	            use_cache = False

	        # Copied from models.bart.modeling_bart.BartModel.forward
	        # different to other models, T5 automatically creates decoder_input_ids from
	        # input_ids if no decoder_input_ids are provided
	        if decoder_input_ids is None and decoder_inputs_embeds is None:
	            if input_ids is None:
	                raise ValueError(
	                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
	                    "passed, `input_ids` cannot be `None`. Please pass either "
	                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
	                )
	            decoder_input_ids = self._shift_right(input_ids)

	        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
	        if head_mask is not None and decoder_head_mask is None:
	            if self.config.num_layers == self.config.num_decoder_layers:
	                warnings.warn(self._HEAD_MASK_WARNING_MSG, FutureWarning)
	                decoder_head_mask = head_mask

	        # Encode if needed (training, first prediction pass)
	        if encoder_outputs is None:
	            encoder_outputs = self.encoder(
	                input_ids=input_ids,
	                attention_mask=attention_mask,
	                inputs_embeds=inputs_embeds,
	                head_mask=head_mask,
	                output_attentions=output_attentions,
	                output_hidden_states=output_hidden_states,
	                return_dict=return_dict,
	            )
	        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
	            # Wrap caller-supplied tuples so attribute access below works.
	            encoder_outputs = BaseModelOutput(
	                last_hidden_state=encoder_outputs[0],
	                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
	                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
	            )

	        hidden_states = encoder_outputs[0]

	        # Decode
	        decoder_outputs = self.decoder(
	            input_ids=decoder_input_ids,
	            attention_mask=decoder_attention_mask,
	            inputs_embeds=decoder_inputs_embeds,
	            past_key_values=None,
	            encoder_hidden_states=hidden_states,
	            encoder_attention_mask=attention_mask,
	            head_mask=decoder_head_mask,
	            cross_attn_head_mask=cross_attn_head_mask,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        sequence_output = decoder_outputs[0]

	        # Project to per-position logits and separate the start / end channels.
	        logits = self.qa_outputs(sequence_output)
	        start_logits, end_logits = logits.split(1, dim=-1)
	        start_logits = start_logits.squeeze(-1).contiguous()
	        end_logits = end_logits.squeeze(-1).contiguous()

	        total_loss = None
	        if start_positions is not None and end_positions is not None:
	            # If we are on multi-GPU, split add a dimension
	            if len(start_positions.size()) > 1:
	                start_positions = start_positions.squeeze(-1)
	            if len(end_positions.size()) > 1:
	                end_positions = end_positions.squeeze(-1)
	            # Always align label devices with the logits (no-op when they already match).
	            start_positions = start_positions.to(start_logits.device)
	            end_positions = end_positions.to(end_logits.device)
	            # sometimes the start/end positions are outside our model inputs, we ignore these terms
	            ignored_index = start_logits.size(1)
	            start_positions = start_positions.clamp(0, ignored_index)
	            end_positions = end_positions.clamp(0, ignored_index)

	            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
	            start_loss = loss_fct(start_logits, start_positions)
	            end_loss = loss_fct(end_logits, end_positions)
	            total_loss = (start_loss + end_loss) / 2

	        if not return_dict:
	            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
	            return ((total_loss,) + output) if total_loss is not None else output

	        return Seq2SeqQuestionAnsweringModelOutput(
	            loss=total_loss,
	            start_logits=start_logits,
	            end_logits=end_logits,
	            past_key_values=decoder_outputs.past_key_values,
	            decoder_hidden_states=decoder_outputs.hidden_states,
	            decoder_attentions=decoder_outputs.attentions,
	            cross_attentions=decoder_outputs.cross_attentions,
	            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
	            encoder_hidden_states=encoder_outputs.hidden_states,
	            encoder_attentions=encoder_outputs.attentions,
	        )

	# Map `T5Config` to `T5ForQuestionAnswering` in the question-answering auto
	# class at import time.
	try:
	    AutoModelForQuestionAnswering.register(T5Config, T5ForQuestionAnswering)
	except ValueError:
	    # Best-effort: a ValueError from `register` is ignored so importing this
	    # module never fails because of the registration.
	    # NOTE(review): presumably raised when `T5Config` already has a
	    # question-answering class registered (e.g. a transformers release that
	    # ships its own) - confirm against the installed version.
	    pass