Automatic Speech Recognition
Transformers
PyTorch
English
joint_aed_ctc_speech-encoder-decoder
custom_code
Eval Results
Lakoc committed on
Commit e9acf97
1 Parent(s): 2b3d2be

Upload JointCTCAttentionEncoderDecoder

auto_wrappers.py ADDED
@@ -0,0 +1,132 @@
import copy
import os

from transformers import AutoConfig, AutoModelForCTC, PretrainedConfig
from transformers.dynamic_module_utils import (
    get_class_from_dynamic_module,
    resolve_trust_remote_code,
)
from transformers.models.auto.auto_factory import _get_model_class

from .extractors import Conv2dFeatureExtractor


class FeatureExtractionInitModifier(type):
    def __new__(cls, name, bases, dct):
        # Create the class using the original definition
        new_cls = super().__new__(cls, name, bases, dct)

        # Save the original __init__ method
        original_init = new_cls.__init__

        # Modify the __init__ method dynamically
        def new_init(self, *args, **kwargs):
            original_init(self, *args, **kwargs)
            if self.config.expect_2d_input:
                getattr(self, self.base_model_prefix).feature_extractor = Conv2dFeatureExtractor(self.config)

        # Replace the __init__ method with the modified version
        new_cls.__init__ = new_init

        return new_cls


class CustomAutoModelForCTC(AutoModelForCTC):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        config = kwargs.pop("config", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        kwargs["_from_auto"] = True
        hub_kwargs_names = [
            "cache_dir",
            "code_revision",
            "force_download",
            "local_files_only",
            "proxies",
            "resume_download",
            "revision",
            "subfolder",
            "use_auth_token",
        ]
        hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}
        if not isinstance(config, PretrainedConfig):
            kwargs_orig = copy.deepcopy(kwargs)
            # ensure not to pollute the config object with torch_dtype="auto" - since it's
            # meaningless in the context of the config object - torch.dtype values are acceptable
            if kwargs.get("torch_dtype", None) == "auto":
                _ = kwargs.pop("torch_dtype")

            config, kwargs = AutoConfig.from_pretrained(
                pretrained_model_name_or_path,
                return_unused_kwargs=True,
                trust_remote_code=trust_remote_code,
                **hub_kwargs,
                **kwargs,
            )

            # if torch_dtype=auto was passed here, ensure to pass it on
            if kwargs_orig.get("torch_dtype", None) == "auto":
                kwargs["torch_dtype"] = "auto"

        has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
        has_local_code = type(config) in cls._model_mapping.keys()
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )
        if has_remote_code and trust_remote_code:
            class_ref = config.auto_map[cls.__name__]
            model_class = get_class_from_dynamic_module(
                class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs
            )
            model_class = FeatureExtractionInitModifier(model_class.__name__, (model_class,), {})
            _ = hub_kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                model_class.register_for_auto_class(cls.__name__)
            else:
                cls.register(config.__class__, model_class, exist_ok=True)
            return model_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
            )
        elif type(config) in cls._model_mapping.keys():
            model_class = _get_model_class(config, cls._model_mapping)
            model_class = FeatureExtractionInitModifier(model_class.__name__, (model_class,), {})
            return model_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
            )
        raise ValueError(
            f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
        )

    @classmethod
    def from_config(cls, config, **kwargs):
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
        has_local_code = type(config) in cls._model_mapping.keys()
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, config._name_or_path, has_local_code, has_remote_code
        )

        if has_remote_code and trust_remote_code:
            class_ref = config.auto_map[cls.__name__]
            if "--" in class_ref:
                repo_id, class_ref = class_ref.split("--")
            else:
                repo_id = config.name_or_path
            model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
            if os.path.isdir(config._name_or_path):
                model_class.register_for_auto_class(cls.__name__)
            else:
                cls.register(config.__class__, model_class, exist_ok=True)
            _ = kwargs.pop("code_revision", None)
            model_class = FeatureExtractionInitModifier(model_class.__name__, (model_class,), {})
            return model_class._from_config(config, **kwargs)
        elif type(config) in cls._model_mapping.keys():
            model_class = _get_model_class(config, cls._model_mapping)
            model_class = FeatureExtractionInitModifier(model_class.__name__, (model_class,), {})
            return model_class._from_config(config, **kwargs)

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
        )
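A brief note on what the wrapper above buys you: CustomAutoModelForCTC resolves the concrete model class exactly like AutoModelForCTC, but wraps it in the FeatureExtractionInitModifier metaclass so that, whenever config.expect_2d_input is set, the default 1-D convolutional front-end is swapped for Conv2dFeatureExtractor right after __init__ runs. The sketch below reproduces only the metaclass mechanism in isolation; DummyConfig and DummyModel are illustrative stand-ins, not classes from this repository.

# Minimal, self-contained sketch of the FeatureExtractionInitModifier idea.
class InitModifier(type):
    def __new__(cls, name, bases, dct):
        new_cls = super().__new__(cls, name, bases, dct)
        original_init = new_cls.__init__

        def new_init(self, *args, **kwargs):
            original_init(self, *args, **kwargs)
            if getattr(self.config, "expect_2d_input", False):
                # in the real wrapper this is Conv2dFeatureExtractor(self.config)
                self.feature_extractor = "2-D conv extractor (placeholder)"

        new_cls.__init__ = new_init
        return new_cls


class DummyConfig:
    expect_2d_input = True


class DummyModel:
    def __init__(self, config):
        self.config = config
        self.feature_extractor = "default 1-D extractor"


PatchedModel = InitModifier(DummyModel.__name__, (DummyModel,), {})
print(PatchedModel(DummyConfig()).feature_extractor)  # -> the 2-D placeholder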
config.json ADDED
@@ -0,0 +1,288 @@
{
  "_commit_hash": null,
  "_name_or_path": "/Users/alexanderpolok/PycharmProjects/huggingface_asr/checkpoint-378950",
  "architectures": [
    "JointCTCAttentionEncoderDecoder"
  ],
  "auto_map": {
    "AutoConfig": "configuration_reguler.JointCTCAttentionEncoderDecoderConfig",
    "AutoModelForSpeechSeq2Seq": "modeling_reguler.JointCTCAttentionEncoderDecoder"
  },
  "ctc_weight": 0.3,
  "decoder": {
    "_name_or_path": "Lakoc/gpt2_512h_8l_add_head6_04",
    "activation_function": "gelu_new",
    "add_cross_attention": true,
    "architectures": null,
    "attn_pdrop": 0.1,
    "average_logits": false,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "embd_pdrop": 0.1,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 1,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "head_locations": [
      5
    ],
    "head_weights": [
      0.6,
      0.4
    ],
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_epsilon": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "gpt2-multi-head",
    "n_embd": 512,
    "n_head": 8,
    "n_inner": 2048,
    "n_layer": 8,
    "n_positions": 1024,
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "pos_emb_fixed": true,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "reorder_and_upcast_attn": false,
    "repetition_penalty": 1.0,
    "resid_pdrop": 0.1,
    "return_dict": true,
    "return_dict_in_generate": false,
    "scale_attn_by_inverse_layer_idx": false,
    "scale_attn_weights": true,
    "sep_token_id": null,
    "summary_activation": null,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": true,
    "summary_type": "cls_index",
    "summary_use_proj": true,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_additional_weights": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": false,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.31.0",
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "vocab_size": 5000
  },
  "decoder_pos_emb_fixed": true,
  "decoder_start_token_id": 0,
  "decoder_vocab_size": 5000,
  "encoder": {
    "_name_or_path": "Lakoc/ebranchformer_16l_512h",
    "activation_dropout": 0.1,
    "adapter_attn_dim": null,
    "adapter_kernel_size": 3,
    "adapter_stride": 2,
    "add_adapter": false,
    "add_cross_attention": false,
    "apply_spec_augment": false,
    "apply_time_warp": false,
    "architectures": null,
    "attention_dropout": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 1,
    "chunk_size_feed_forward": 0,
    "classifier_proj_size": 256,
    "codevector_dim": 256,
    "conformer_conv_dropout": 0.1,
    "contrastive_logits_temperature": 0.1,
    "conv_bias": false,
    "conv_depthwise_kernel_size": 31,
    "conv_dim": [
      512,
      512
    ],
    "conv_kernel": [
      3,
      3
    ],
    "conv_stride": [
      2,
      2
    ],
    "cross_attention_hidden_size": null,
    "csgu_activation": "identity",
    "csgu_conv_dropout": 0.1,
    "csgu_kernel_size": 31,
    "csgu_use_linear_after_conv": false,
    "ctc_loss_reduction": "mean",
    "ctc_zero_infinity": true,
    "decoder_start_token_id": null,
    "diversity_loss_weight": 0.1,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "do_stable_layer_norm": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "expect_2d_input": true,
    "exponential_decay_length_penalty": null,
    "fe_position_embeddings": true,
    "feat_extract_activation": "gelu",
    "feat_extract_norm": "group",
    "feat_proj_dropout": 0.0,
    "feat_quantizer_dropout": 0.0,
    "final_dropout": 0.1,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout": 0.1,
    "hidden_size": 512,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 2048,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "layerdrop": 0.0,
    "length_penalty": 1.0,
    "mask_feature_length": 10,
    "mask_feature_min_masks": 0,
    "mask_feature_prob": 0.0,
    "mask_time_length": 10,
    "mask_time_min_masks": 2,
    "mask_time_prob": 0.05,
    "max_length": 20,
    "max_source_positions": 1024,
    "merge_conv_kernel": 31,
    "min_length": 0,
    "model_type": "wav2vec2-ebranchformer",
    "no_repeat_ngram_size": 0,
    "num_adapter_layers": 3,
    "num_attention_heads": 4,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_codevector_groups": 2,
    "num_codevectors_per_group": 320,
    "num_conv_pos_embedding_groups": 16,
    "num_conv_pos_embeddings": 128,
    "num_feat_extract_layers": 2,
    "num_hidden_layers": 16,
    "num_mel_bins": 80,
    "num_negatives": 100,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_size": 512,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 3,
    "position_embeddings_type": "relative",
    "prefix": null,
    "problem_type": null,
    "proj_codevector_dim": 256,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "rotary_embedding_base": 10000,
    "second_dim_input_size": 80,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "tdnn_dilation": [
      1,
      2,
      3,
      1,
      1
    ],
    "tdnn_dim": [
      512,
      512,
      512,
      512,
      1500
    ],
    "tdnn_kernel": [
      5,
      3,
      3,
      1,
      1
    ],
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "time_warp_mode": "bicubic",
    "time_warp_window": 5,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.31.0",
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_fbanks": true,
    "use_macaron_ff": true,
    "use_weighted_layer_sum": false,
    "vocab_size": 5000,
    "xvector_output_dim": 512
  },
  "encoder_ctc_loss_reduction": "mean",
  "encoder_expect_2d_input": true,
  "encoder_layerdrop": 0.0,
  "encoder_pad_token_id": 3,
  "encoder_second_dim_input_size": 80,
  "encoder_vocab_size": 5000,
  "is_encoder_decoder": true,
  "lsm_factor": 0.1,
  "model_type": "joint_aed_ctc_speech-encoder-decoder",
  "pad_token_id": 3,
  "shared_lm_head": false,
  "tie_word_embeddings": false,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": null
}
configuration_reguler.py ADDED
@@ -0,0 +1,23 @@
from transformers import AutoConfig, AutoModelForCausalLM, SpeechEncoderDecoderConfig

from .auto_wrappers import CustomAutoModelForCTC
from .e_branchformer import Wav2Vec2EBranchformerConfig, Wav2Vec2EBranchformerForCTC
from .multi_head_gpt2 import GPT2LMMultiHeadModel, GPT2MultiHeadConfig
from .residual_clasiffier_gpt2 import (
    GPT2ResidualsLMHeadConfig,
    GPT2ResidualsLMHeadModel,
)

AutoConfig.register("gpt2-multi-head", GPT2MultiHeadConfig)
AutoModelForCausalLM.register(GPT2MultiHeadConfig, GPT2LMMultiHeadModel)

AutoConfig.register("gpt2-residuals-head", GPT2ResidualsLMHeadConfig)
AutoModelForCausalLM.register(GPT2ResidualsLMHeadConfig, GPT2ResidualsLMHeadModel)

AutoConfig.register("wav2vec2-ebranchformer", Wav2Vec2EBranchformerConfig)
CustomAutoModelForCTC.register(Wav2Vec2EBranchformerConfig, Wav2Vec2EBranchformerForCTC)


class JointCTCAttentionEncoderDecoderConfig(SpeechEncoderDecoderConfig):
    model_type = "joint_aed_ctc_speech-encoder-decoder"
    is_composition = True
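Because config.json maps AutoConfig and AutoModelForSpeechSeq2Seq to these remote-code classes, loading the checkpoint only requires trust_remote_code=True. A hedged usage sketch follows; "Lakoc/<this-repo>" is a placeholder, since the repository id is not stated on this page.

from transformers import AutoModelForSpeechSeq2Seq

# Pulls configuration_reguler.py / modeling_reguler.py from this commit via auto_map.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "Lakoc/<this-repo>",    # hypothetical repo id, substitute the real one
    trust_remote_code=True,  # required: the model_type is not part of core Transformers
)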
ctc_scorer.py ADDED
@@ -0,0 +1,311 @@
# pylint: skip-file
# Copied from: https://github.com/espnet/espnet/blob/master/espnet/nets/ctc_prefix_score.py
import torch
from transformers import GenerationConfig, LogitsProcessor


class GenerationConfigWithCTC(GenerationConfig):
    def __init__(self, ctc_weight=0.0, ctc_margin=0, **kwargs):
        super().__init__(**kwargs)
        self.ctc_weight = ctc_weight
        self.ctc_margin = ctc_margin


class CTCPrefixScoreTH(object):
    """Batch processing of CTCPrefixScore

    which is based on Algorithm 2 in WATANABE et al.
    "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
    but extended to efficiently compute the label probabilities for multiple
    hypotheses simultaneously
    See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based
    Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019.
    """

    def __init__(self, x, xlens, blank, eos, margin=0):
        """Construct CTC prefix scorer

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        :param torch.Tensor xlens: input lengths (B,)
        :param int blank: blank label id
        :param int eos: end-of-sequence id
        :param int margin: margin parameter for windowing (0 means no windowing)
        """
        # In the comment lines,
        # we assume T: input_length, B: batch size, W: beam width, O: output dim.
        self.logzero = -10000000000.0
        self.blank = blank
        self.eos = eos
        self.batch = x.size(0)
        self.input_length = x.size(1)
        self.odim = x.size(2)
        self.dtype = x.dtype
        self.device = torch.device("cuda:%d" % x.get_device()) if x.is_cuda else torch.device("cpu")
        # Pad the rest of posteriors in the batch
        # TODO(takaaki-hori): need a better way without for-loops
        for i, l in enumerate(xlens):
            if l < self.input_length:
                x[i, l:, :] = self.logzero
                x[i, l:, blank] = 0
        # Reshape input x
        xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
        xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
        self.x = torch.stack([xn, xb])  # (2, T, B, O)
        self.end_frames = torch.as_tensor(xlens) - 1

        # Setup CTC windowing
        self.margin = margin
        if margin > 0:
            self.frame_ids = torch.arange(self.input_length, dtype=self.dtype, device=self.device)
        # Base indices for index conversion
        self.idx_bh = None
        self.idx_b = torch.arange(self.batch, device=self.device)
        self.idx_bo = (self.idx_b * self.odim).unsqueeze(1)

    def __call__(self, y, state, scoring_ids=None, att_w=None):
        """Compute CTC prefix scores for next labels

        :param list y: prefix label sequences
        :param tuple state: previous CTC state
        :param torch.Tensor att_w: attention weights to decide CTC window
        :return new_state, ctc_local_scores (BW, O)
        """

        # print(self.tokenizer.batch_decode(y))
        output_length = len(y[0]) - 1  # ignore sos
        last_ids = [yi[-1] for yi in y]  # last output label ids
        n_bh = len(last_ids)  # batch * hyps
        n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
        self.scoring_num = scoring_ids.size(-1) if scoring_ids is not None else 0
        # prepare state info
        if state is None:
            r_prev = torch.full(
                (self.input_length, 2, self.batch, n_hyps),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            r_prev[:, 1] = torch.cumsum(self.x[0, :, :, self.blank], 0).unsqueeze(2)
            r_prev = r_prev.view(-1, 2, n_bh)
            s_prev = 0.0
            f_min_prev = 0
            f_max_prev = 1
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

        # select input dimensions for scoring
        if self.scoring_num > 0:
            scoring_idmap = torch.full((n_bh, self.odim), -1, dtype=torch.long, device=self.device)
            snum = self.scoring_num
            if self.idx_bh is None or n_bh > len(self.idx_bh):
                self.idx_bh = torch.arange(n_bh, device=self.device).view(-1, 1)
            scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = torch.arange(snum, device=self.device)
            scoring_idx = (scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, 1)).view(-1)
            x_ = torch.index_select(self.x.view(2, -1, self.batch * self.odim), 2, scoring_idx).view(2, -1, n_bh, snum)
        else:
            scoring_ids = None
            scoring_idmap = None
            snum = self.odim
            x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, n_bh, snum)

        # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
        # that corresponds to r_t^n(h) and r_t^b(h) in a batch.
        r = torch.full(
            (self.input_length, 2, n_bh, snum),
            self.logzero,
            dtype=self.dtype,
            device=self.device,
        )
        if output_length == 0:
            r[0, 0] = x_[0, 0]

        r_sum = torch.logsumexp(r_prev, 1)
        log_phi = r_sum.unsqueeze(2).repeat(1, 1, snum)
        if scoring_ids is not None:
            for idx in range(n_bh):
                pos = scoring_idmap[idx, last_ids[idx]]
                if pos >= 0:
                    log_phi[:, idx, pos] = r_prev[:, 1, idx]
        else:
            for idx in range(n_bh):
                log_phi[:, idx, last_ids[idx]] = r_prev[:, 1, idx]

        # decide start and end frames based on attention weights
        if att_w is not None and self.margin > 0:
            f_arg = torch.matmul(att_w, self.frame_ids)
            f_min = max(int(f_arg.min().cpu()), f_min_prev)
            f_max = max(int(f_arg.max().cpu()), f_max_prev)
            start = min(f_max_prev, max(f_min - self.margin, output_length, 1))
            end = min(f_max + self.margin, self.input_length)
        else:
            f_min = f_max = 0
            start = max(output_length, 1)
            end = self.input_length

        if start > end:
            return torch.full_like(s_prev, self.logzero), (
                r,
                torch.full_like(s_prev, self.logzero),
                f_min,
                f_max,
                scoring_idmap,
            )

        # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
        for t in range(start, end):
            rp = r[t - 1]
            rr = torch.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(2, 2, n_bh, snum)
            r[t] = torch.logsumexp(rr, 1) + x_[:, t]

        # compute log prefix probabilities log(psi)
        log_phi_x = torch.cat((log_phi[0].unsqueeze(0), log_phi[:-1]), dim=0) + x_[0]
        if scoring_ids is not None:
            log_psi = torch.full((n_bh, self.odim), self.logzero, dtype=self.dtype, device=self.device)
            log_psi_ = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )
            for si in range(n_bh):
                log_psi[si, scoring_ids[si]] = log_psi_[si]
        else:
            log_psi = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )

        for si in range(n_bh):
            log_psi[si, self.eos] = max(log_psi[si, self.eos], r_sum[self.end_frames[si // n_hyps], si])

        # exclude blank probs
        log_psi[:, self.blank] = self.logzero

        token_scores = log_psi - s_prev
        token_scores[token_scores == 0] = self.logzero

        return token_scores, (r, log_psi, f_min, f_max, scoring_idmap)

    def index_select_state(self, state, best_ids):
        """Select CTC states according to best ids

        :param state    : CTC state
        :param best_ids : index numbers selected by beam pruning (B, W)
        :return selected_state
        """
        r, s, f_min, f_max, scoring_idmap = state
        # convert ids to BHO space
        n_bh = len(s)
        n_hyps = n_bh // self.batch
        vidx = (best_ids + (self.idx_b * (n_hyps * self.odim)).view(-1, 1)).view(-1)
        # select hypothesis scores
        s_new = torch.index_select(s.view(-1), 0, vidx)
        s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
        # convert ids to BHS space (S: scoring_num)
        if scoring_idmap is not None:
            snum = self.scoring_num
            hyp_idx = (best_ids // self.odim + (self.idx_b * n_hyps).view(-1, 1)).view(-1)
            label_ids = torch.fmod(best_ids, self.odim).view(-1)
            score_idx = scoring_idmap[hyp_idx, label_ids]
            score_idx[score_idx == -1] = 0
            vidx = score_idx + hyp_idx * snum
        else:
            snum = self.odim
        # select forward probabilities
        r_new = torch.index_select(r.view(-1, 2, n_bh * snum), 2, vidx).view(-1, 2, n_bh)
        return r_new, s_new, f_min, f_max

    def extend_prob(self, x):
        """Extend CTC prob.

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        """

        if self.x.shape[1] < x.shape[1]:  # self.x (2,T,B,O); x (B,T,O)
            # Pad the rest of posteriors in the batch
            # TODO(takaaki-hori): need a better way without for-loops
            xlens = [x.size(1)]
            for i, l in enumerate(xlens):
                if l < self.input_length:
                    x[i, l:, :] = self.logzero
                    x[i, l:, self.blank] = 0
            tmp_x = self.x
            xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
            xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
            self.x = torch.stack([xn, xb])  # (2, T, B, O)
            self.x[:, : tmp_x.shape[1], :, :] = tmp_x
            self.input_length = x.size(1)
            self.end_frames = torch.as_tensor(xlens) - 1

    def extend_state(self, state):
        """Compute CTC prefix state.

        :param state    : CTC state
        :return ctc_state
        """

        if state is None:
            # nothing to do
            return state
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

            r_prev_new = torch.full(
                (self.input_length, 2),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            start = max(r_prev.shape[0], 1)
            r_prev_new[0:start] = r_prev
            for t in range(start, self.input_length):
                r_prev_new[t, 1] = r_prev_new[t - 1, 1] + self.x[0, t, :, self.blank]

            return (r_prev_new, s_prev, f_min_prev, f_max_prev)


class CTCRescorerLogitsProcessor(LogitsProcessor):
    def __init__(
        self,
        encoder_logits: torch.FloatTensor,
        encoder_output_lens: torch.LongTensor,
        pad_token_id: int,
        eos_token_id: int,
        ctc_margin: int,
        ctc_weight: float,
        num_beams: int,
    ):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.ctc_prefix_scorer = CTCPrefixScoreTH(
            torch.nn.functional.log_softmax(encoder_logits, dim=-1),
            encoder_output_lens,
            pad_token_id,
            eos_token_id,
            ctc_margin,
        )
        self.ctc_weight = ctc_weight
        self.ctc_states = None
        self.num_beams = num_beams

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        scores[:, self.pad_token_id] = self.ctc_prefix_scorer.logzero
        if self.ctc_states is not None:
            self.ctc_states = self.ctc_prefix_scorer.index_select_state(
                self.ctc_states, input_ids[:, -1].reshape(-1, self.num_beams)
            )
        ctc_scores, ctc_states = self.ctc_prefix_scorer(input_ids, self.ctc_states)
        self.ctc_states = ctc_states
        next_token_scores = (1 - self.ctc_weight) * scores + self.ctc_weight * ctc_scores
        # return scores
        return next_token_scores


class LogSoftmaxProcessor(LogitsProcessor):
    def __init__(
        self,
    ):
        super().__init__()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        scores = torch.nn.functional.log_softmax(scores, dim=-1)
        return scores
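The rescorer above interpolates the attention decoder's log-probabilities with the CTC prefix scores: next_token_scores = (1 - w) * attention + w * ctc, where w is ctc_weight (0.3 in this checkpoint's config.json). A minimal sketch of just that interpolation on dummy scores, without instantiating the scorer class:

import torch

ctc_weight = 0.3  # matches "ctc_weight" in config.json
att_scores = torch.log_softmax(torch.randn(2, 5000), dim=-1)  # decoder scores, (beams, vocab)
ctc_scores = torch.log_softmax(torch.randn(2, 5000), dim=-1)  # CTC prefix scores, (beams, vocab)
next_token_scores = (1 - ctc_weight) * att_scores + ctc_weight * ctc_scores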
e_branchformer.py ADDED
@@ -0,0 +1,252 @@
""" PyTorch Wav2Vec2-Ebranchformer model."""

from typing import Optional

import torch
import torch.utils.checkpoint
from torch import nn
from transformers.activations import ACT2FN
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Config,
    Wav2Vec2ForCTC,
    Wav2Vec2ForPreTraining,
)
from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerEncoder,
)
from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
    Wav2Vec2ConformerFeedForward as Wav2Vec2EBranchformerFeedForward,
)
from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
    Wav2Vec2ConformerModel,
)
from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
    Wav2Vec2ConformerSelfAttention as Wav2Vec2EBranchformerSelfAttention,
)
from transformers.utils import logging

logger = logging.get_logger(__name__)


class Wav2Vec2EBranchformerConfig(Wav2Vec2ConformerConfig, Wav2Vec2Config):
    """Config for EBranchformer model extending conformer."""

    model_type = "wav2vec2-ebranchformer"

    def __init__(
        self,
        ebranchformer_conv_dropout=0.1,
        csgu_activation="identity",
        csgu_kernel_size=31,
        csgu_use_linear_after_conv=False,
        merge_conv_kernel=31,
        use_macaron_ff=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # EBranchformer related params
        self.csgu_kernel_size = csgu_kernel_size
        self.csgu_activation = csgu_activation
        self.csgu_conv_dropout = ebranchformer_conv_dropout
        self.csgu_use_linear_after_conv = csgu_use_linear_after_conv
        self.merge_conv_kernel = merge_conv_kernel
        self.use_macaron_ff = use_macaron_ff


class ConvolutionalSpatialGatingUnit(torch.nn.Module):
    """Convolutional Spatial Gating Unit (CSGU)."""

    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__()

        n_channels = config.intermediate_size // 2  # split input channels
        self.norm = torch.nn.LayerNorm(n_channels)
        self.conv = torch.nn.Conv1d(
            n_channels,
            n_channels,
            config.csgu_kernel_size,
            1,
            (config.csgu_kernel_size - 1) // 2,
            groups=n_channels,
        )
        if config.csgu_use_linear_after_conv:
            self.linear = torch.nn.Linear(n_channels, n_channels)
        else:
            self.linear = None

        if config.csgu_activation == "identity":
            self.act = torch.nn.Identity()
        else:
            self.act = ACT2FN[config.csgu_activation]

        self.dropout = torch.nn.Dropout(config.csgu_conv_dropout)

    def forward(self, hidden_states: torch.FloatTensor):
        """Forward method

        Args:
            hidden_states (torch.Tensor): (N, T, D)

        Returns:
            out (torch.Tensor): (N, T, D/2)
        """

        x_r, x_g = hidden_states.chunk(2, dim=-1)

        x_g = self.norm(x_g)  # (N, T, D/2)
        x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2)  # (N, T, D/2)
        if self.linear is not None:
            x_g = self.linear(x_g)

        x_g = self.act(x_g)
        hidden_states = x_r * x_g  # (N, T, D/2)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ConvolutionalGatingMLP(torch.nn.Module):
    """Convolutional Gating MLP (cgMLP)."""

    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__()
        self.channel_proj1 = torch.nn.Sequential(
            torch.nn.Linear(config.hidden_size, config.intermediate_size), torch.nn.GELU()
        )
        self.csgu = ConvolutionalSpatialGatingUnit(config)
        self.channel_proj2 = torch.nn.Linear(config.intermediate_size // 2, config.hidden_size)

    def forward(self, hidden_states: torch.FloatTensor):
        hidden_states = self.channel_proj1(hidden_states)  # hidden_size -> intermediate_size
        hidden_states = self.csgu(hidden_states)  # intermediate_size -> intermediate_size/2
        hidden_states = self.channel_proj2(hidden_states)  # intermediate_size/2 -> hidden_size
        return hidden_states


class Wav2Vec2EBranchformerEncoderLayer(nn.Module):
    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__()
        embed_dim = config.hidden_size
        dropout = config.attention_dropout

        # Feed-forward 1
        if config.use_macaron_ff:
            self.ff1 = nn.Sequential(nn.LayerNorm(embed_dim), Wav2Vec2EBranchformerFeedForward(config))

        # Self-Attention
        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
        self.self_attn_dropout = torch.nn.Dropout(dropout)
        self.self_attn = Wav2Vec2EBranchformerSelfAttention(config)

        # cgMLP
        self.cgMLP = ConvolutionalGatingMLP(config)
        self.cgMLP_layer_norm = nn.LayerNorm(config.hidden_size)
        self.cgMLP_dropout = torch.nn.Dropout(dropout)

        # Merge
        self.final_dropout = torch.nn.Dropout(dropout)
        self.merge_proj = torch.nn.Linear(embed_dim + embed_dim, embed_dim)
        self.depthwise_conv_fusion = torch.nn.Conv1d(
            embed_dim + embed_dim,
            embed_dim + embed_dim,
            kernel_size=config.merge_conv_kernel,
            stride=1,
            padding=(config.merge_conv_kernel - 1) // 2,
            groups=embed_dim + embed_dim,
            bias=True,
        )
        self.final_layer_norm = nn.LayerNorm(embed_dim)

        # Feed-forward 2
        if config.use_macaron_ff:
            self.ff2 = nn.Sequential(nn.LayerNorm(embed_dim), Wav2Vec2EBranchformerFeedForward(config))

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 1. Optional ff1
        if self.ff1:
            residual = hidden_states
            hidden_states = residual + 0.5 * self.ff1(hidden_states)

        # 2. Split input to three branches
        residual = hidden_states
        global_branch = hidden_states
        local_branch = hidden_states

        # 3. Self-Attention branch
        global_branch = self.self_attn_layer_norm(global_branch)
        global_branch, attn_weights = self.self_attn(
            hidden_states=global_branch,
            attention_mask=attention_mask,
            relative_position_embeddings=relative_position_embeddings,
            output_attentions=output_attentions,
        )
        global_branch = self.self_attn_dropout(global_branch)

        # 4. cgMLP Branch
        local_branch = self.cgMLP_layer_norm(local_branch)
        local_branch = self.cgMLP(local_branch)

        # 5. Merge operator
        # a, concat
        hidden_states = torch.cat([global_branch, local_branch], dim=-1)
        merge_residual = hidden_states
        # b, depth-wise conv mixing
        hidden_states = merge_residual + self.depthwise_conv_fusion(hidden_states.transpose(1, 2)).transpose(1, 2)
        # c, project back to original size and final dropout
        hidden_states = self.final_dropout(self.merge_proj(hidden_states))

        # 6. Add residual
        hidden_states = residual + hidden_states

        # 7. Optional ff2
        if self.ff2:
            residual = hidden_states
            hidden_states = residual + 0.5 * self.ff2(hidden_states)

        # 8. Final layer norm
        hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states, attn_weights


class Wav2Vec2EBranchformerEncoder(Wav2Vec2ConformerEncoder):
    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [Wav2Vec2EBranchformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        )
        self.pos_conv_embed = None


class Wav2Vec2EBranchformerModel(Wav2Vec2ConformerModel):
    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__(config)
        self.encoder = Wav2Vec2EBranchformerEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()


class Wav2Vec2EBranchformerForPreTraining(Wav2Vec2ForPreTraining):
    config_class = Wav2Vec2EBranchformerConfig
    base_model_prefix = "wav2vec2"

    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2EBranchformerModel(config)
        self.post_init()


class Wav2Vec2EBranchformerForCTC(Wav2Vec2ForCTC):
    config_class = Wav2Vec2EBranchformerConfig
    base_model_prefix = "wav2vec2"

    def __init__(self, config: Wav2Vec2EBranchformerConfig):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2EBranchformerModel(config)
        self.post_init()
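To make the cgMLP branch's dimensions concrete: channel_proj1 expands hidden_size to intermediate_size, the CSGU halves that by splitting into a residual half and a gated half, and channel_proj2 projects back. A plain-torch shape check with the sizes from config.json (hidden 512, intermediate 2048); the CSGU's LayerNorm and depthwise Conv1d on the gate half are omitted here for brevity, this only illustrates the split-and-gate shapes.

import torch

hidden_size, intermediate_size, T = 512, 2048, 100
x = torch.randn(1, T, hidden_size)

up = torch.nn.Sequential(torch.nn.Linear(hidden_size, intermediate_size), torch.nn.GELU())
h = up(x)                      # (1, T, 2048)
x_r, x_g = h.chunk(2, dim=-1)  # each (1, T, 1024): residual half and gated half
gated = x_r * x_g              # CSGU gating output
down = torch.nn.Linear(intermediate_size // 2, hidden_size)
print(down(gated).shape)       # torch.Size([1, 100, 512])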
embeddings.py ADDED
@@ -0,0 +1,86 @@
import torch
from torch import nn


class AdaptiveEmbedding(nn.Module):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
        super().__init__()

        self.n_token = n_token
        self.d_embed = d_embed

        self.cutoffs = cutoffs + [n_token]
        self.div_val = div_val
        self.d_proj = d_proj

        self.emb_scale = d_proj**0.5

        self.cutoff_ends = [0] + self.cutoffs

        self.emb_layers = nn.ModuleList()
        self.emb_projs = nn.ParameterList()
        if div_val == 1:
            self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
            if d_proj != d_embed:
                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
        else:
            for i in range(len(self.cutoffs)):
                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                d_emb_i = d_embed // (div_val**i)
                self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))

    def forward(self, inp):
        if self.div_val == 1:
            embed = self.emb_layers[0](inp)
            if self.d_proj != self.d_embed:
                embed = nn.functional.linear(embed, self.emb_projs[0])
        else:
            param = next(self.parameters())
            inp_flat = inp.view(-1)
            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
            for i in range(len(self.cutoffs)):
                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]

                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
                indices_i = mask_i.nonzero().squeeze()

                if indices_i.numel() == 0:
                    continue

                inp_i = inp_flat.index_select(0, indices_i) - l_idx
                emb_i = self.emb_layers[i](inp_i)
                emb_i = nn.functional.linear(emb_i, self.emb_projs[i])

                emb_flat.index_copy_(0, indices_i, emb_i)

            embed_shape = inp.size() + (self.d_proj,)
            embed = emb_flat.view(embed_shape)

        embed.mul_(self.emb_scale)

        return embed


class PositionalEmbeddingAux(nn.Module):
    def __init__(self, demb):
        super().__init__()

        self.demb = demb

        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, pos_seq, bsz=None):
        sinusoid_inp = torch.outer(pos_seq, self.inv_freq)
        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)

        if bsz is not None:
            return pos_emb[:, None, :].expand(-1, bsz, -1)
        else:
            return pos_emb[:, None, :]


class PositionalEmbedding(PositionalEmbeddingAux):
    def forward(self, pos_seq, bsz=None):
        return super().forward(pos_seq.squeeze(0), bsz=bsz).squeeze(1)
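PositionalEmbeddingAux builds standard sinusoidal embeddings from an inverse-frequency table. A minimal reproduction of that math for a handful of positions, assuming demb is even (it is 512 for this decoder):

import torch

demb = 512
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))  # (demb/2,)
pos_seq = torch.arange(4.0)                                      # positions 0..3
sinusoid_inp = torch.outer(pos_seq, inv_freq)                    # (4, demb/2)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
print(pos_emb.shape)  # torch.Size([4, 512])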
extractors.py ADDED
@@ -0,0 +1,32 @@
import torch
from torch import nn
from transformers.activations import ACT2FN


class Conv2dFeatureExtractor(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = torch.nn.Sequential(
            *[
                nn.Sequential(
                    nn.Conv2d(
                        conv_in,
                        out_channels=conv_out,
                        kernel_size=(conv_kernel, conv_kernel),
                        stride=(conv_stride, conv_stride),
                    ),
                    ACT2FN[config.feat_extract_activation],
                )
                for conv_in, conv_out, conv_kernel, conv_stride in zip(
                    [1, *config.conv_dim], config.conv_dim, config.conv_kernel, config.conv_stride
                )
            ],
        )

        linear_in_dim = config.conv_dim[-1] * (((config.second_dim_input_size - 1) // 2 - 1) // 2)
        self.out = torch.nn.Linear(linear_in_dim, config.hidden_size, bias=True)

    def forward(self, input_values: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv(input_values[:, None, ...])
        hidden_states = self.out(hidden_states.transpose(1, 2).flatten(2, 3))
        return hidden_states.transpose(1, 2)
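The linear_in_dim arithmetic matches two unpadded kernel-3, stride-2 Conv2d layers over the 80 mel bins from config.json: 80 -> 39 -> 19, so the linear layer sees 512 * 19 = 9728 features. A quick shape sketch with plain torch (example frame count of 200 is arbitrary); the module above additionally transposes the result to (batch, hidden_size, frames) at the end.

import torch

feats = torch.randn(1, 200, 80)  # (batch, frames, mel bins)
conv = torch.nn.Sequential(
    torch.nn.Conv2d(1, 512, kernel_size=3, stride=2), torch.nn.GELU(),
    torch.nn.Conv2d(512, 512, kernel_size=3, stride=2), torch.nn.GELU(),
)
h = conv(feats[:, None, ...])                      # (1, 512, 49, 19)
linear_in_dim = 512 * (((80 - 1) // 2 - 1) // 2)   # 512 * 19 = 9728
out = torch.nn.Linear(linear_in_dim, 512)(h.transpose(1, 2).flatten(2, 3))
print(out.shape)                                   # torch.Size([1, 49, 512])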
generation_config.json ADDED
@@ -0,0 +1,8 @@
{
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "max_length": 512,
  "pad_token_id": 3,
  "transformers_version": "4.31.0"
}
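A hedged decoding sketch combining these generation defaults with the CTC parameters defined in ctc_scorer.py. It assumes the files from this commit are importable locally, that `model` is the loaded JointCTCAttentionEncoderDecoder, and that `input_features` is a (batch, frames, 80) log-Mel tensor; num_beams=4 is an illustrative choice, not part of this commit.

from ctc_scorer import GenerationConfigWithCTC  # shipped in this commit

gen_config = GenerationConfigWithCTC(
    ctc_weight=0.3,          # joint CTC/attention interpolation weight
    ctc_margin=0,            # no CTC windowing
    num_beams=4,             # illustrative beam size
    bos_token_id=0,
    decoder_start_token_id=0,
    eos_token_id=1,
    pad_token_id=3,
    max_length=512,
)
ids = model.generate(input_features, generation_config=gen_config)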
modeling_reguler.py ADDED
@@ -0,0 +1,484 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import CrossEntropyLoss
7
+ from transformers import (
8
+ AutoConfig,
9
+ AutoModelForCausalLM,
10
+ AutoModelForSpeechSeq2Seq,
11
+ GenerationConfig,
12
+ PretrainedConfig,
13
+ PreTrainedModel,
14
+ SpeechEncoderDecoderConfig,
15
+ SpeechEncoderDecoderModel,
16
+ StoppingCriteriaList,
17
+ )
18
+ from transformers.generation.logits_process import LogitsProcessorList
19
+ from transformers.generation.utils import GenerateOutput
20
+ from transformers.modeling_outputs import CausalLMOutput, Seq2SeqLMOutput
21
+ from transformers.models.speech_encoder_decoder.modeling_speech_encoder_decoder import (
22
+ shift_tokens_right,
23
+ )
24
+ from transformers.utils import logging
25
+
26
+ from .auto_wrappers import CustomAutoModelForCTC
27
+ from .configuration_reguler import JointCTCAttentionEncoderDecoderConfig
28
+ from .ctc_scorer import (
29
+ CTCRescorerLogitsProcessor,
30
+ GenerationConfigWithCTC,
31
+ LogSoftmaxProcessor,
32
+ )
33
+ from .embeddings import AdaptiveEmbedding, PositionalEmbedding
34
+ from .multi_head_gpt2 import GPT2LMMultiHeadModel
35
+
36
+ logger = logging.get_logger("transformers")
37
+
38
+
39
+ def wav2vec2_forward_hidden_return_hook(_: PreTrainedModel, __: Any, kwargs):
40
+ kwargs["output_hidden_states"] = True
41
+
42
+
43
+ @dataclass
44
+ class Seq2SeqLMOutputLosses(Seq2SeqLMOutput):
45
+ enc_loss: Optional[torch.FloatTensor] = None
46
+ dec_loss: Optional[torch.FloatTensor] = None
47
+ encoder_logits: Optional[torch.FloatTensor] = None
48
+
49
+
50
+ def wav2vec2_for_ctc_forward_hook(model: CustomAutoModelForCTC, input: Any, output: CausalLMOutput):
51
+ if "hidden_states" in output:
52
+ output.last_hidden_state = output.hidden_states[-1]
53
+
54
+
55
+ class JointCTCAttentionEncoderDecoder(SpeechEncoderDecoderModel):
56
+ """Custom model for CTC+Attention loss based on the ESPNet architecture"""
57
+
58
+ config_class = JointCTCAttentionEncoderDecoderConfig
59
+ base_model_prefix = "joint_aed_ctc_speech-encoder-decoder"
60
+
61
+ def __init__(
62
+ self,
63
+ config: Optional[PretrainedConfig] = None,
64
+ encoder: Optional[PreTrainedModel] = None,
65
+ decoder: Optional[PreTrainedModel] = None,
66
+ ):
67
+ if config is None and (encoder is None or decoder is None):
68
+ raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
69
+ if config is None:
70
+ config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
71
+ else:
72
+ if not isinstance(config, self.config_class):
73
+ raise ValueError(f"Config: {config} has to be of type {self.config_class}")
74
+
75
+ if config.decoder.cross_attention_hidden_size is not None:
76
+ if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
77
+ raise ValueError(
78
+ "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
79
+ f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
80
+ f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
81
+ " `config.encoder.hidden_size`."
82
+ )
83
+
84
+ # initialize with config
85
+ # make sure input & output embeddings is not tied
86
+ config.tie_word_embeddings = False
87
+ super(SpeechEncoderDecoderModel, self).__init__(config)
88
+
89
+ if encoder is None:
90
+ encoder = CustomAutoModelForCTC.from_config(config.encoder)
91
+ encoder.register_forward_hook(wav2vec2_for_ctc_forward_hook)
92
+ encoder.register_forward_pre_hook(wav2vec2_forward_hidden_return_hook, with_kwargs=True)
93
+ if decoder is None:
94
+ decoder = AutoModelForCausalLM.from_config(config.decoder)
95
+
96
+ self.encoder = encoder
97
+ self.decoder = decoder
98
+
99
+ if self.encoder.config.to_dict() != self.config.encoder.to_dict():
100
+ logger.warning(
101
+ f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
102
+ f" {self.config.encoder}"
103
+ )
104
+ if self.decoder.config.to_dict() != self.config.decoder.to_dict():
105
+ logger.warning(
106
+ f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
107
+ f" {self.config.decoder}"
108
+ )
109
+
110
+ # make sure that the individual model's config refers to the shared config
111
+ # so that the updates to the config will be synced
112
+ self.encoder.config = self.config.encoder
113
+ self.decoder.config = self.config.decoder
114
+
115
+ # get encoder output hidden size
116
+ self.encoder_output_dim = getattr(config.encoder, "output_hidden_size", config.encoder.hidden_size)
117
+ if (
118
+ self.encoder_output_dim != self.decoder.config.hidden_size
119
+ and self.decoder.config.cross_attention_hidden_size is None
120
+ ):
121
+ # encoder outputs might need to be projected to different dimension for decoder
122
+ self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)
123
+
124
+ if self.encoder.get_output_embeddings() is not None:
125
+ raise ValueError(
126
+ f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
127
+ )
128
+ self.enc_loss_weight = config.ctc_weight
129
+ self.dec_loss_weight = 1 - config.ctc_weight
130
+ self.lsm_factor = config.lsm_factor
131
+
132
+ if config.shared_lm_head:
133
+ self.encoder.lm_head.weight = self.decoder.lm_head.weight
134
+
135
+ if (hasattr(config, "decoder_pos_emb_fixed") and config.decoder_pos_emb_fixed) or (
136
+ hasattr(config.decoder, "pos_emb_fixed") and config.decoder.pos_emb_fixed
137
+ ):
138
+ self.decoder.transformer.wte = AdaptiveEmbedding(
139
+ n_token=config.decoder.vocab_size,
140
+ d_embed=config.decoder.hidden_size,
141
+ d_proj=config.decoder.hidden_size,
142
+ cutoffs=[],
143
+ )
144
+ self.decoder.transformer.wpe = PositionalEmbedding(demb=config.decoder.hidden_size)
145
+ self.decoder.post_init()
146
+
147
+ self.encoder_logits = None
148
+ self.encoder_output_lens = None
149
+
150
+ @classmethod
151
+ def from_encoder_decoder_pretrained(
152
+ cls,
153
+ encoder_pretrained_model_name_or_path: str = None,
154
+ decoder_pretrained_model_name_or_path: str = None,
155
+ *model_args,
156
+ **kwargs,
157
+ ) -> PreTrainedModel:
158
+ kwargs_encoder = {
159
+ argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
160
+ }
161
+
162
+ kwargs_decoder = {
163
+ argument[len("decoder_") :]: value
164
+ for argument, value in kwargs.items()
165
+ if argument.startswith("decoder_") and argument != "decoder_start_token_id"
166
+ }
167
+
168
+ # remove encoder, decoder kwargs from kwargs
169
+ for key in kwargs_encoder.keys():
170
+ del kwargs["encoder_" + key]
171
+ for key in kwargs_decoder.keys():
172
+ del kwargs["decoder_" + key]
173
+
174
+ # Load and initialize the encoder and decoder
175
+ # The distinction between encoder and decoder at the model level is made
176
+ # by the value of the flag `is_decoder` that we need to set correctly.
177
+ encoder = kwargs_encoder.pop("model", None)
178
+ if encoder is None:
179
+ if encoder_pretrained_model_name_or_path is None:
180
+ raise ValueError(
181
+ "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
182
+ "to be defined."
183
+ )
184
+
185
+ if "config" not in kwargs_encoder:
186
+ encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
187
+ encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
188
+ )
189
+
190
+ if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
191
+ logger.info(
192
+ f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
193
+ "from a decoder model. Cross-attention and casual mask are disabled."
194
+ )
195
+ encoder_config.is_decoder = False
196
+ encoder_config.add_cross_attention = False
197
+
198
+ kwargs_encoder["config"] = encoder_config
199
+
200
+ encoder = CustomAutoModelForCTC.from_pretrained(
201
+ encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
202
+ )
203
+ encoder.register_forward_hook(wav2vec2_for_ctc_forward_hook)
204
+
205
+ decoder = kwargs_decoder.pop("model", None)
206
+ if decoder is None:
207
+ if decoder_pretrained_model_name_or_path is None:
208
+ raise ValueError(
209
+ "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
210
+ "to be defined."
211
+ )
212
+
213
+ if "config" not in kwargs_decoder:
214
+ decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
215
+ decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
216
+ )
217
+
218
+ if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
219
+ logger.info(
220
+ f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
221
+ f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
222
+ f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
223
+ )
224
+ decoder_config.is_decoder = True
225
+ decoder_config.add_cross_attention = True
226
+
227
+ kwargs_decoder["config"] = decoder_config
228
+
229
+ if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
230
+ logger.warning(
231
+ f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
232
+ f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
233
+ "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
234
+ "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
235
+ "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
236
+ )
237
+
238
+ decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
239
+
240
+ # instantiate config with corresponding kwargs
241
+ config = JointCTCAttentionEncoderDecoderConfig.from_encoder_decoder_configs(
242
+ encoder.config, decoder.config, **kwargs
243
+ )
244
+
245
+ # make sure input & output embeddings is not tied
246
+         config.tie_word_embeddings = False
+         return cls(encoder=encoder, decoder=decoder, config=config)
+
+     def forward(
+         self,
+         inputs: Optional[torch.FloatTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         decoder_input_ids: Optional[torch.LongTensor] = None,
+         decoder_attention_mask: Optional[torch.BoolTensor] = None,
+         encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         input_values: Optional[torch.FloatTensor] = None,
+         input_features: Optional[torch.FloatTensor] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs,
+     ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutputLosses]:
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+         kwargs_decoder = {
+             argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+         }
+
+         if encoder_outputs is None:
+             if inputs is None:
+                 if input_values is not None and input_features is not None:
+                     raise ValueError("You cannot specify both input_values and input_features at the same time")
+                 elif input_values is not None:
+                     inputs = input_values
+                 elif input_features is not None:
+                     inputs = input_features
+                 else:
+                     raise ValueError("You have to specify either input_values or input_features")
+
+             encoder_outputs = self.encoder(
+                 inputs,
+                 attention_mask=attention_mask,
+                 output_attentions=output_attentions,
+                 output_hidden_states=output_hidden_states,
+                 return_dict=return_dict,
+                 labels=labels,
+                 **kwargs_encoder,
+             )
+         elif isinstance(encoder_outputs, tuple):
+             encoder_outputs = CausalLMOutput(*encoder_outputs)
+
+         encoder_hidden_states = encoder_outputs.last_hidden_state
+
+         # optionally project encoder_hidden_states
+         if (
+             self.encoder_output_dim != self.decoder.config.hidden_size
+             and self.decoder.config.cross_attention_hidden_size is None
+         ):
+             encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+         # compute correct encoder attention mask
+         if attention_mask is not None:
+             encoder_attention_mask = self.encoder._get_feature_vector_attention_mask(
+                 encoder_hidden_states.shape[1], attention_mask
+             )
+         else:
+             encoder_attention_mask = None
+
+         if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+             decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+         # Decode
+         decoder_outputs = self.decoder(
+             input_ids=decoder_input_ids,
+             attention_mask=decoder_attention_mask,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             inputs_embeds=decoder_inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=True
+             if hasattr(self.decoder, "head_weights") and len(self.decoder.head_weights) > 1
+             else output_hidden_states,
+             use_cache=use_cache,
+             past_key_values=past_key_values,
+             return_dict=return_dict,
+             **kwargs_decoder,
+         )
+
+         # Compute loss independent from decoder (as some shift the logits inside them)
+         loss = enc_loss = dec_loss = None
+
+         if labels is not None:
+             loss_fct = CrossEntropyLoss(label_smoothing=self.lsm_factor)
+             enc_loss = encoder_outputs.loss if return_dict else encoder_outputs[0]
+             if isinstance(self.decoder, GPT2LMMultiHeadModel) and len(self.decoder.head_weights) > 1:
+                 dec_loss = torch.zeros_like(enc_loss)
+                 lm_logits_per_layer = []
+                 for index, lm_head, lm_weight in zip(
+                     [*self.decoder.head_locations, -1],
+                     [*self.decoder.additional_lm_heads, self.decoder.lm_head],
+                     self.decoder.head_weights,
+                 ):
+                     lm_logits = lm_head(decoder_outputs.hidden_states[index])
+                     dec_loss += lm_weight * loss_fct(
+                         lm_logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1)
+                     )
+                     lm_logits_per_layer.append(lm_logits)
+                 if self.decoder.config.average_logits:
+                     decoder_outputs.logits = torch.matmul(
+                         torch.stack(lm_logits_per_layer).T,
+                         torch.tensor(self.decoder.head_weights, device=lm_logits_per_layer[-1].device),
+                     ).T
+
+             else:
+                 dec_logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
+                 dec_loss = loss_fct(dec_logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1))
+             loss = self.enc_loss_weight * enc_loss + self.dec_loss_weight * dec_loss
+
+         if not return_dict:
+             if loss is not None:
+                 return (loss,) + decoder_outputs + encoder_outputs
+             else:
+                 return decoder_outputs + encoder_outputs
+
+         return Seq2SeqLMOutputLosses(
+             loss=loss,
+             enc_loss=enc_loss,
+             dec_loss=dec_loss,
+             logits=decoder_outputs.logits,
+             past_key_values=decoder_outputs.past_key_values,
+             decoder_hidden_states=decoder_outputs.hidden_states,
+             decoder_attentions=decoder_outputs.attentions,
+             cross_attentions=decoder_outputs.cross_attentions,
+             encoder_last_hidden_state=encoder_hidden_states,
+             encoder_hidden_states=encoder_outputs.hidden_states,
+             encoder_attentions=encoder_outputs.attentions,
+             encoder_logits=encoder_outputs.logits,
+         )
+
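+     # The generation-related overrides below hook the encoder's CTC head into decoding:
+     # `_prepare_encoder_decoder_kwargs_for_generation` caches the encoder's CTC logits and output
+     # lengths, `_get_logits_processor` builds a `CTCRescorerLogitsProcessor` from them whenever
+     # `generation_config.ctc_weight > 0`, and `generate` clears the cached tensors once decoding ends.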
+     def _get_logits_processor(
+         self,
+         generation_config: GenerationConfigWithCTC,
+         input_ids_seq_length: int,
+         encoder_input_ids: torch.LongTensor,
+         prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
+         logits_processor: Optional[LogitsProcessorList],
+     ) -> LogitsProcessorList:
+         processors = super()._get_logits_processor(
+             generation_config, input_ids_seq_length, encoder_input_ids, prefix_allowed_tokens_fn, logits_processor
+         )
+         if generation_config.ctc_weight > 0:
+             if generation_config.num_beams <= 1:
+                 processors.append(LogSoftmaxProcessor())
+             self.ctc_rescorer = CTCRescorerLogitsProcessor(
+                 self.encoder_logits,
+                 self.encoder_output_lens,
+                 self.generation_config.pad_token_id,
+                 self.generation_config.eos_token_id,
+                 self.generation_config.ctc_margin,
+                 self.generation_config.ctc_weight,
+                 self.generation_config.num_beams,
+             )
+             processors.append(self.ctc_rescorer)
+         return processors
+
+     def _prepare_encoder_decoder_kwargs_for_generation(
+         self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None
+     ) -> Dict[str, Any]:
+         self.encoder_output_lens = self.encoder._get_feat_extract_output_lengths(
+             model_kwargs["attention_mask"].sum(dim=1)
+         )
+         model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
+             inputs_tensor, model_kwargs, model_input_name
+         )
+         self.encoder_logits = model_kwargs["encoder_outputs"].logits
+         return model_kwargs
+
+     @staticmethod
+     def _expand_inputs_for_generation(
+         expand_size: int = 1,
+         is_encoder_decoder: bool = False,
+         input_ids: Optional[torch.LongTensor] = None,
+         **model_kwargs,
+     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
+         """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
+
+         def _expand_dict_for_generation(dict_to_expand):
+             for key in dict_to_expand:
+                 if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], torch.Tensor) and key != "loss":
+                     dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+             return dict_to_expand
+
+         if input_ids is not None:
+             input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+         model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+         if is_encoder_decoder:
+             if model_kwargs.get("encoder_outputs") is None:
+                 raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+             model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+             model_kwargs["encoder_outputs"].last_hidden_state = model_kwargs[
+                 "encoder_outputs"
+             ].last_hidden_state.repeat_interleave(expand_size, dim=0)
+
+         return input_ids, model_kwargs
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         generation_config: Optional[GenerationConfig] = None,
+         logits_processor: Optional[LogitsProcessorList] = None,
+         stopping_criteria: Optional[StoppingCriteriaList] = None,
+         prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+         synced_gpus: Optional[bool] = None,
+         assistant_model: Optional["PreTrainedModel"] = None,
+         streamer: Optional["BaseStreamer"] = None,
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         output = super().generate(
+             inputs,
+             generation_config,
+             logits_processor,
+             stopping_criteria,
+             prefix_allowed_tokens_fn,
+             synced_gpus,
+             assistant_model,
+             streamer,
+             **kwargs,
+         )
+         self.encoder_logits = None
+         self.encoder_output_lens = None
+         return output
+
+
+ AutoConfig.register("joint_aed_ctc_speech-encoder-decoder", JointCTCAttentionEncoderDecoderConfig)
+ AutoModelForSpeechSeq2Seq.register(JointCTCAttentionEncoderDecoderConfig, JointCTCAttentionEncoderDecoder)
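For context, the registrations above make the model loadable through the Auto classes with `trust_remote_code=True`. Below is a minimal usage sketch; the repository id is a placeholder, and the presence of a bundled tokenizer/feature extractor as well as the `ctc_weight`/`ctc_margin` fields in the checkpoint's generation config are assumptions for illustration, not something this commit guarantees.

import torch
from transformers import AutoFeatureExtractor, AutoModelForSpeechSeq2Seq, AutoTokenizer

repo = "user/joint-aed-ctc-asr"  # hypothetical repository id, not taken from this commit
feature_extractor = AutoFeatureExtractor.from_pretrained(repo)
tokenizer = AutoTokenizer.from_pretrained(repo)
# trust_remote_code pulls in the custom classes registered above.
model = AutoModelForSpeechSeq2Seq.from_pretrained(repo, trust_remote_code=True)

speech = torch.randn(16000).numpy()  # ~1 s of dummy 16 kHz audio
inputs = feature_extractor(speech, sampling_rate=16000, return_attention_mask=True, return_tensors="pt")
# The checkpoint's generation config is assumed to carry ctc_weight/ctc_margin for the CTC rescorer.
predicted_ids = model.generate(**inputs, num_beams=4, max_length=128)
print(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))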
multi_head_gpt2.py ADDED
@@ -0,0 +1,160 @@
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+ from torch.nn import CrossEntropyLoss
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+
+ class GPT2MultiHeadConfig(GPT2Config):
+     model_type = "gpt2-multi-head"
+
+     def __init__(
+         self,
+         head_locations=None,
+         head_weights=None,
+         tie_additional_weights=False,
+         average_logits=False,
+         *args,
+         **kwargs,
+     ):
+         super().__init__(*args, **kwargs)
+         self.head_locations = head_locations
+         self.head_weights = head_weights
+         self.tie_additional_weights = tie_additional_weights
+         self.average_logits = average_logits
+
+
+ class GPT2LMMultiHeadModel(GPT2LMHeadModel):
+     config_class = GPT2MultiHeadConfig
+
+     def __init__(self, config: GPT2MultiHeadConfig):
+         super().__init__(config)
+         if config.head_locations is not None:
+             if not len(config.head_locations) + 1 == len(config.head_weights):
+                 raise ValueError("The number of head locations should be equal to the number of head weights minus 1")
+             self.head_locations = config.head_locations
+             self.additional_lm_heads = nn.ModuleList(
+                 [nn.Linear(config.n_embd, config.vocab_size, bias=False) for _ in config.head_locations]
+             )
+             self.head_weights = config.head_weights
+         else:
+             self.head_locations = []
+             self.additional_lm_heads = nn.ModuleList([])
+             self.head_weights = [1.0]
+         self.post_init()
+
+     def tie_weights(self):
+         """
+         Tie the weights between the input embeddings and the output embeddings.
+
+         If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
+         weights instead.
+         """
+         super().tie_weights()
+         if hasattr(self, "additional_lm_heads") and getattr(self.config, "tie_additional_weights", False):
+             input_embeddings = self.get_input_embeddings()
+             for classifier in self.additional_lm_heads:
+                 if self.config.torchscript:
+                     classifier.weight = nn.Parameter(input_embeddings.weight.clone())
+                 else:
+                     classifier.weight = input_embeddings.weight
+
+                 if getattr(classifier, "bias", None) is not None:
+                     classifier.bias.data = nn.functional.pad(
+                         classifier.bias.data,
+                         (
+                             0,
+                             classifier.weight.shape[0] - classifier.bias.shape[0],
+                         ),
+                         "constant",
+                         0,
+                     )
+                 if hasattr(classifier, "out_features") and hasattr(input_embeddings, "num_embeddings"):
+                     classifier.out_features = input_embeddings.num_embeddings
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+             `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+             are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         transformer_outputs = self.transformer(
+             input_ids,
+             past_key_values=past_key_values,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=True,
+             return_dict=return_dict,
+         )
+         hidden_states = transformer_outputs[2]
+
+         # Set device for model parallelism
+         if self.model_parallel:
+             torch.cuda.set_device(self.transformer.first_device)
+             hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+         lm_logits = self.lm_head(hidden_states[-1])
+         loss = None
+         if labels is not None:
+             loss = torch.tensor(0.0, device=hidden_states[-1].device)
+             lm_logits = []
+             loss_fct = CrossEntropyLoss()
+
+             for index, lm_head, lm_weight in zip(
+                 [*self.head_locations, -1],
+                 [*self.additional_lm_heads, self.lm_head],
+                 self.head_weights,
+             ):
+                 lm_logits.append(lm_head(hidden_states[index]))
+                 # Shift so that tokens < n predict n
+                 shift_logits = lm_logits[-1][..., :-1, :].contiguous()
+                 shift_labels = labels[..., 1:].contiguous()
+                 # Flatten the tokens
+                 loss += lm_weight * loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+             if self.config.average_logits:
+                 lm_logits = (torch.vstack(lm_logits) * torch.tensor(self.head_weights)).mean(dim=0)
+             else:
+                 lm_logits = lm_logits[-1]
+         if not return_dict:
+             output = (lm_logits,) + transformer_outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=lm_logits,
+             past_key_values=transformer_outputs.past_key_values,
+             hidden_states=transformer_outputs.hidden_states,
+             attentions=transformer_outputs.attentions,
+             cross_attentions=transformer_outputs.cross_attentions,
+         )
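The decoder above attaches auxiliary LM heads to intermediate layers and sums their shifted cross-entropy losses with per-head weights (the joint encoder-decoder wrapper recomputes the same weighted loss when it detects a `GPT2LMMultiHeadModel` decoder). A rough, self-contained sketch follows; the sizes, layer indices, and weights are made up for illustration, and the file is assumed to be importable as `multi_head_gpt2`.

import torch
from multi_head_gpt2 import GPT2LMMultiHeadModel, GPT2MultiHeadConfig

# Illustrative values only: auxiliary heads on layers 6 and 9 plus the final head,
# with one more weight than head_locations (checked in __init__).
config = GPT2MultiHeadConfig(
    n_layer=12,
    n_embd=256,
    n_head=4,
    vocab_size=5000,
    head_locations=[6, 9],
    head_weights=[0.2, 0.2, 0.6],
    tie_additional_weights=True,
    average_logits=False,
)
model = GPT2LMMultiHeadModel(config)

input_ids = torch.randint(0, config.vocab_size, (2, 16))
out = model(input_ids=input_ids, labels=input_ids)
print(out.loss, out.logits.shape)  # loss = weighted sum of per-head losses; logits come from the final head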
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:239d2cf3c581c86eff7d96eb7eb3300a07948030cc7d56cd42a4af363a66a8f6
+ size 698154846
residual_clasiffier_gpt2.py ADDED
@@ -0,0 +1,99 @@
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+ from torch.nn import CrossEntropyLoss
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+
+
+ class GPT2ResidualsLMHeadConfig(GPT2Config):
+     model_type = "gpt2-residuals-head"
+
+     def __init__(self, connected_residuals=None, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.connected_residuals = connected_residuals
+
+
+ class GPT2ResidualsLMHeadModel(GPT2LMHeadModel):
+     config_class = GPT2ResidualsLMHeadConfig
+
+     def __init__(self, config: GPT2ResidualsLMHeadConfig):
+         super().__init__(config)
+         self.connected_residuals = config.connected_residuals
+         self.lm_head = nn.Linear(config.n_embd * len(self.connected_residuals), config.vocab_size, bias=False)
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+             `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+             are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         transformer_outputs = self.transformer(
+             input_ids,
+             past_key_values=past_key_values,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=True,
+             return_dict=return_dict,
+         )
+         hidden_states = transformer_outputs[2]
+
+         # Set device for model parallelism
+         if self.model_parallel:
+             torch.cuda.set_device(self.transformer.first_device)
+             hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+         hidden_states = torch.concat([hidden_states[index] for index in self.connected_residuals], dim=-1)
+         lm_logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             # Shift so that tokens < n predict n
+             shift_logits = lm_logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+         if not return_dict:
+             output = (lm_logits,) + transformer_outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=lm_logits,
+             past_key_values=transformer_outputs.past_key_values,
+             hidden_states=transformer_outputs.hidden_states,
+             attentions=transformer_outputs.attentions,
+             cross_attentions=transformer_outputs.cross_attentions,
+         )
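Finally, `GPT2ResidualsLMHeadModel` concatenates several hidden states along the feature dimension and feeds them to a single widened LM head. A minimal sketch with illustrative toy sizes and `connected_residuals` indices (the real values live in the checkpoint's config.json); `tie_word_embeddings` is disabled because the widened head can no longer share weights with the input embeddings.

import torch
from residual_clasiffier_gpt2 import GPT2ResidualsLMHeadConfig, GPT2ResidualsLMHeadModel

# Illustrative: concatenate the embedding output (index 0) and the last hidden layer (index -1).
config = GPT2ResidualsLMHeadConfig(
    n_layer=6,
    n_embd=256,
    n_head=4,
    vocab_size=5000,
    connected_residuals=[0, -1],
    tie_word_embeddings=False,  # the (vocab_size, 2 * n_embd) head cannot be tied to the embeddings
)
model = GPT2ResidualsLMHeadModel(config)

input_ids = torch.randint(0, config.vocab_size, (2, 16))
out = model(input_ids=input_ids, labels=input_ids)
print(out.loss, out.logits.shape)  # logits: (2, 16, 5000)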