from typing import List, Optional, Tuple, Union import torch from torch import nn from transformers.modeling_outputs import TokenClassifierOutput from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification from transformers.models.roberta.modeling_roberta import ROBERTA_INPUTS_DOCSTRING, add_start_docstrings_to_model_forward, add_code_sample_docstrings from extended_embeddings.extended_embeddings_model import ExtendedEmbeddigsRobertaModel _CONFIG_FOR_DOC = "RobertaConfig" class ExtendedEmbeddigsRobertaForTokenClassification(RobertaForTokenClassification): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.roberta = ExtendedEmbeddigsRobertaModel(config, add_pooling_layer=False) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size + 3, config.num_labels) # Initialize weights and apply final processing self.post_init() @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", expected_loss=0.01, ) def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, per: Optional[torch.Tensor] = None, org: Optional[torch.Tensor] = None, loc: Optional[torch.Tensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, per=per, org=org, loc=loc, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: # move labels to correct device to enable model parallelism labels = labels.to(logits.device) loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )