This view is limited to 50 files because the change set contains too many changes; see the raw diff for the full file list.
- .gitignore +1 -1
- amt/src/.coverage +0 -0
- amt/src/.coveragerc +5 -0
- amt/src/config/.DS_Store +0 -0
- amt/src/config/config.py +272 -0
- amt/src/config/data_presets.py +811 -0
- amt/src/config/task.py +119 -0
- amt/src/config/vocabulary.py +384 -0
- amt/src/extras/.DS_Store +0 -0
- amt/src/extras/Dockerfile +18 -0
- amt/src/extras/check_drum_channel_slakh.py +24 -0
- amt/src/extras/dataset_mutable_var_sanity_check.py +81 -0
- amt/src/extras/datasets_eval_testing.py +42 -0
- amt/src/extras/demo_cross_augmentation.py +69 -0
- amt/src/extras/demo_intra_augmentation.py +52 -0
- amt/src/extras/download_mirst500.py +50 -0
- amt/src/extras/fig/label_smooth_interval_of_interest.png +0 -0
- amt/src/extras/fig/pitchshift_benchnmark.png +0 -0
- amt/src/extras/fig/pitchshift_stretch_and_resampler_process_time.png +0 -0
- amt/src/extras/inspecting_slakh_bass.py +34 -0
- amt/src/extras/install_deepspeed.md +28 -0
- amt/src/extras/label_smoothing.py +67 -0
- amt/src/extras/multi_channel_seqlen_stats.py +177 -0
- amt/src/extras/npy_speed_benchmark.py +187 -0
- amt/src/extras/perceivertf_inspect.py +640 -0
- amt/src/extras/perceivertf_multi_inspect.py +778 -0
- amt/src/extras/pitch_shift_benchmark.py +167 -0
- amt/src/extras/remove_silence_musicnet_midi.py +32 -0
- amt/src/extras/rotary_positional_embedding.py +191 -0
- amt/src/extras/run_spleeter_mir1k.sh +17 -0
- amt/src/extras/run_spleeter_mirst500.sh +13 -0
- amt/src/extras/run_spleeter_mirst500_cmedia.sh +13 -0
- amt/src/extras/swap_channel.py +122 -0
- amt/src/extras/t5_dev.py +41 -0
- amt/src/extras/t5perceiver.py +443 -0
- amt/src/extras/unimax_sampler/README.md +45 -0
- amt/src/extras/unimax_sampler/demo.py +15 -0
- amt/src/extras/unimax_sampler/unimax_sampler.py +168 -0
- amt/src/install_dataset.py +285 -0
- amt/src/model/RoPE/RoPE.py +306 -0
- amt/src/model/conformer_helper.py +169 -0
- amt/src/model/conformer_mod.py +439 -0
- amt/src/model/conv_block.py +217 -0
- amt/src/model/ff_layer.py +238 -0
- amt/src/model/init_train.py +281 -0
- amt/src/model/lm_head.py +40 -0
- amt/src/model/lr_scheduler.py +91 -0
- amt/src/model/ops.py +111 -0
- amt/src/model/optimizers.py +218 -0
- amt/src/model/perceiver_helper.py +290 -0
.gitignore
CHANGED
@@ -1,2 +1,2 @@
-amt/
+amt/logs/
 examples/
amt/src/.coverage
ADDED
Binary file (53.2 kB)
amt/src/.coveragerc
ADDED
@@ -0,0 +1,5 @@
+[run]
+omit =
+    train.py
+    test.py
+    install*.py
amt/src/config/.DS_Store
ADDED
Binary file (6.15 kB)
amt/src/config/config.py
ADDED
@@ -0,0 +1,272 @@
"""config.py"""
import numpy as np
# yapf: disable
"""
audio_cfg:
- Used by 'ymt3' to create a spectrogram layer.
- Input shape of model is determined by audio_cfg.
- 'train.py' arguments can override these defaults.
"""
audio_cfg = {
    # Overridable by args in train.py
    "codec": "melspec",  # {melspec, spec} melspec for MT3, spec for PerceiverTF
    "hop_length": 128,  # {128, 300} 128 for MT3, 300 for PerceiverTF
    # Shared audio parameters
    "audio_backend": "torchaudio",  # {torchaudio, nnAudio}
    "sample_rate": 16000,
    "input_frames": 32767,  # number of input frames (~=2.048 s), determining in-/output shape of front layers.
    "n_fft": 2048,
    "n_mels": 512,  # only for melspec
    "f_min": 50.0,
    "f_max": 8000.0,
}  # TODO: currently dataloader is not updated by "input_frames"

"""
model_cfg:
- Encoder type dictates use of T5_CFG or PERCEIVER_TF_CFG.
- 'train.py' arguments can override these defaults.
"""
model_cfg = {
    "encoder_type": "t5",  # {"t5", "perceiver-tf", "conformer"}
    "decoder_type": "t5",  # {"t5", "multi-t5"}
    "pre_encoder_type": "default",  # {None, "default", "conv", "conv1d", "conv2d_avpt"} by default, t5:None, perceiver:conv.
    "pre_encoder_type_default": {"t5": None, "perceiver-tf": "conv", "conformer": None},
    "pre_decoder_type": "default",  # {None, 'linear', 'conv1', 'mlp', 'group_linear'} see model/projection_layer.py
    "pre_decoder_type_default": {  # [enc_type][dec_type]
        "t5": {"t5": None,},
        "perceiver-tf": {"t5": "linear", "multi-t5": "mc_shared_linear"},
        "conformer": {"t5": None,},
    },
    "conv_out_channels": 128,  # number of filters for 'conv' pre_encoder. Otherwise ignored.
    "t5_basename": "google/t5-v1_1-small",
    "pretrained": False,  # bool, if True, load pretrained weights from t5_basename. Mismatched layers are ignored.
    "use_task_conditional_encoder": True,  # True by default, but default task is None. So not activated by default.
    "use_task_conditional_decoder": True,  # True by default, but default task is None. So not activated by default.
    "d_feat": "auto",  # Input audio feature dimension for encoder. Automatically inferred by audio_cfg and existence of pre_encoders.
    "tie_word_embeddings": True,  # If True, weights of embed_tokens and lm_head are tied for stabilizing gradients.
    "vocab_size": "auto",  # int or "auto", automatically inferred by task manager.
    "num_max_positions": "auto",  # int or "auto". Length of positional encoding. Automatically inferred by "feat_length", "event_length" and task_manager.max_task_token_length.
    # 'vocab_size', 'tie_word_embeddings' and 'num_max_positions' are auto-copied to encoder and decoder configs in the below.
    "encoder": {
        "t5": {
            "d_model": 512,  # Hidden size of T5 encoder.
            "num_heads": 6,
            "num_layers": 8,
            "dropout_rate": 0.05,
            "position_encoding_type": "sinusoidal",  # {'sinusoidal', 'trainable'}.
            "ff_widening_factor": 2,  # widening factor for MLP/MoE layers. Default is 2 in T5.
            "ff_layer_type": "t5_gmlp",  # {'t5_gmlp', 'moe', 'mlp', 'gmlp'}. 'moe' for mixture of experts, 'mlp' for standard transformer dense layer, 'gmlp' for simple gated MLP.
        },
        "perceiver-tf": {
            "num_latents": 24,  # number of latents in Perceiver. 24 in perceiver-tf paper.
            "d_latent": 128,  # latent dimension of Perceiver. 128 in perceiver-tf paper.
            "d_model": "q",  # int or "q" or "kv". Inner-dim of sca and local/temporal self-att.
            # "q" follows "latent_dim". "kv" follows "d_feat". Best practice is to inc-/decrease 'd_latent', instead of 'd_model'.
            "num_blocks": 3,  # number of Perceiver-TF blocks in encoder. L in the paper.
            "num_local_transformers_per_block": 2,  # N in the paper.
            "num_temporal_transformers_per_block": 2,  # M in the paper.
            "sca_use_query_residual": False,
            "dropout_rate": 0.1,
            "position_encoding_type": "trainable",  # {'trainable', 'rotary', 'alibi', 'alibit', None, 'tkd', 'td', 'tk', 'kdt'}. alibit is alibi with trainable slopes.
            "attention_to_channel": True,  # Whether to use channel attention in sca.
            "layer_norm_type": "layer_norm",  # {'layer_norm', 'rms_norm'}
            "ff_layer_type": "mlp",  # {'moe', 'mlp', 'gmlp'}. 'moe' for mixture of experts, 'mlp' for standard transformer dense layer, 'gmlp' for simple gated MLP.
            "ff_widening_factor": 1,  # widening factor for MLP/MoE layers. Default is 1.
            "moe_num_experts": 4,  # number of experts in MoE layer. Default is 4. Disabled if ff_layer_type is not 'moe'.
            "moe_topk": 2,  # top-k routing in MoE layer. Default is 2. Disabled if ff_layer_type is not 'moe'.
            "hidden_act": 'gelu',  # activation function in MLP/MoE layer. Default is 'gelu'. {'gelu', 'silu', 'relu'}
            "rotary_type_sca": "pixel",  # {'l'|'lang', 'p'|'pixel'}. Default is 'pixel'.
            "rotary_type_latent": "pixel",  # {'l'|'lang', 'p'|'pixel'}. Default is 'pixel'.
            "rotary_type_temporal": "lang",  # {'l'|'lang', 'p'|'pixel'}. Default is 'lang'.
            "rotary_apply_to_keys": False,  # Whether to apply rotary to keys. Default is False.
            "rotary_partial_pe": False,  # Whether to use partial positional encoding. Default is False.
        },
        "conformer": {
            "d_model": 512,  # Hidden size of T5 encoder.
            "intermediate_size": 512,  # or 2048. size of the intermediate feed forward layer in each T5Block
            "num_heads": 8,
            "num_layers": 8,
            "dropout_rate": 0.1,
            "layerdrop": 0.1,  # see https://arxiv.org/abs/1909.11556
            "position_encoding_type": "rotary",  # {'rotary', 'relative'}.
            "conv_dim": (512, 512, 512, 512, 512, 512, 512),
            "conv_stride": (5, 2, 2, 2, 2, 2, 2),
            "conv_kernel": (10, 3, 3, 3, 3, 3, 3),
            "conv_depthwise_kernel_size": 31,
        },

    },
    "decoder": {
        "t5": {
            "d_model": 512,  # Hidden size of T5 encoder. If encoder has lower dim, it is projected to this dim for enc-dec cross att.
            "num_heads": 6,
            "num_layers": 8,
            "dropout_rate": 0.05,
            "position_encoding_type": "sinusoidal",  # {'sinusoidal', 'trainable'}.
            "ff_widening_factor": 2,  # widening factor for MLP/MoE layers. Default is 2 in T5.
            "ff_layer_type": "t5_gmlp",  # {'t5_gmlp', 'moe', 'mlp', 'gmlp'}. 'moe' for mixture of experts, 'mlp' for standard transformer dense layer, 'gmlp' for simple gated MLP.
        },
        "multi-t5": {
            "d_model": 512,  # Hidden size of T5 encoder. Recommended: {256 or 512}
            "num_heads": 6,
            "num_layers": 8,
            "dropout_rate": 0.05,
            "position_encoding_type": "sinusoidal",  # {'sinusoidal', 'trainable'}.
            "ff_widening_factor": 2,  # widening factor for MLP/MoE layers. Default is 2 in T5.
            "ff_layer_type": "t5_gmlp",  # {'t5_gmlp', 'moe', 'mlp', 'gmlp'}. 'moe' for mixture of experts, 'mlp' for standard transformer dense layer, 'gmlp' for simple gated MLP.
            "num_channels": 13,
        },
    },
    "feat_length": "auto",  # Input audio feature length for encoder. Automatically inferred by audio_cfg.
    # mt3: 256 time steps
    "event_length": 1024,  # max length of event tokens excluding task tokens <-- 128 for multi-t5
    "init_factor": 1.0,  # initialization factor for embedding layers
}

# yapf: enable
shared_cfg = {
    "PATH": {
        "data_home": "../../data",  # path to the data directory. If using relative path, it is relative to /src directory.
    },
    "BSZ": {  # global batch size is local_bsz * n_GPUs in DDP mode
        "train_sub": 12,  #20, # sub-batch size is per CPU worker
        "train_local": 24,  #40, # local batch size is per GPU in DDP mode
        "validation": 64,  # validation batch size is per GPU in DDP mode
        "test": 64,
    },
    "AUGMENTATION": {
        "train_random_amp_range": [0.8, 1.1],  # min and max amplitude scaling factor
        "train_stem_iaug_prob": 0.7,  # probability of stem activation in intra-stem augmentation
        "train_stem_xaug_policy": {
            "max_k": 3,
            "tau": 0.3,
            "alpha": 1.0,
            "max_subunit_stems": 12,  # the number of subunit stems to be reduced to this number of stems
            "p_include_singing": None,  # NOT IMPLEMENTED; probability of including singing for cross augmented examples. if None, use base probability.
            "no_instr_overlap": True,
            "no_drum_overlap": True,
            "uhat_intra_stem_augment": True,
        },
        "train_pitch_shift_range": [-2, 2],  # [min, max] in semitones. None or [0, 0] for no pitch shift.
    },
    "DATAIO": {  # do not set `shuffle` here.
        "num_workers": 4,  # num_worker is per GPU in DDP mode
        "prefetch_factor": 2,  #2,
        "pin_memory": True,
        "persistent_workers": False,
    },
    "CHECKPOINT": {
        "save_top_k": 4,  # max top k checkpoints to save
        "monitor": 'validation/macro_onset_f',
        "mode": 'max',
        # "every_n_epochs": 20, # only working when check_val_every_n_epoch is 0
        "save_last": True,  # save last model
        "filename": "{epoch}-{step}",
    },
    "TRAINER": {  # do not overwrite args in this section
        "limit_train_batches": 1.0,  # How much of training dataset to check (float = fraction, int = num_batches)
        "limit_val_batches": 1.0,
        "limit_test_batches": 1.0,
        "gradient_clip_val": 1.0,  # {0 or None} means don't clip.
        "accumulate_grad_batches": 1,  #1, # Accumulates grads every k batches. If set to 1, no effect.
        "check_val_every_n_epoch": 1,  #5, 1 for very large dataset such as EGMD
        "num_sanity_val_steps": 0,
    },
    "WANDB": {
        "save_dir": "../logs",
        "cache_dir": "../logs/.wandb_cache",
        "resume": "allow",
        "anonymous": "allow",  # {never, allow, must}
        "mode": "online",  # {online, offline, disabled}
    },
    "LR_SCHEDULE": {
        # "scheduler_type": "cosine", # {legacy, cosine, constant}
        "warmup_steps": 1000,  # only for cosine scheduler, legacy scheduler follows T5's legacy schedule
        "total_steps": 100000,  # argparser of train.py can overwrite this
        "final_cosine": 1e-5,  # only for cosine scheduler
    },
    "TOKENIZER": {
        "max_shift_steps": "auto",  # max number of shift steps in the model. (int) or "auto". If "auto", it is set by audio_cfg["input_frames"] and shift_steps_ms. 206 with default setup.
        "shift_step_ms": 10,  # shift step in ms
    },
}

T5_BASE_CFG = {
    "google/t5-v1_1-small": {
        "architectures": ["T5ForConditionalGeneration"],
        "d_ff": 1024,  # size of the intermediate feed forward layer in each T5Block. Can be overwritten by ff_widening_factor in model_cfg.
        "d_kv": 64,  # d_kv has to be equal to d_model // num_heads.
        # "d_model": 512, # encoder hidden size, defined by model_cfg
        "decoder_start_token_id": 0,
        "dense_act_fn": "gelu_new",
        # "dropout_rate": 0.05, # can be overwritten by args in ymt3
        "eos_token_id": 1,
        "feed_forward_proj": "gated-gelu",
        "initializer_factor": 1.0,
        "is_encoder_decoder": True,
        "is_gated_act": True,
        "layer_norm_epsilon": 1e-06,
        "model_type": "t5",
        # "num_decoder_layers": 8, # defined by model_cfg
        # "num_heads": 6, # defined by model_cfg
        # "num_layers": 8, # defined by model_cfg
        "output_past": True,
        "pad_token_id": 0,
        "relative_attention_num_buckets": 32,
        # "tie_word_embeddings": True,
        "use_cache": True,
        # "vocab_size": 1391 # vocab_size is automatically set by the task manager...
    },
    "google/t5-efficient-small": {
        "architectures": ["T5ForConditionalGeneration"],
        "d_ff": 2048,
        "d_kv": 64,
        "d_model": 512,
        "decoder_start_token_id": 0,
        "dropout_rate": 0.1,
        "eos_token_id": 1,
        "feed_forward_proj": "relu",
        "initializer_factor": 1.0,
        "is_encoder_decoder": True,
        "layer_norm_epsilon": 1e-06,
        "model_type": "t5",
        "num_decoder_layers": 6,
        "num_heads": 8,
        "num_layers": 6,
        "pad_token_id": 0,
        "relative_attention_num_buckets": 32,
        "torch_dtype": "float32",
        "transformers_version": "4.17.0.dev0",
        "use_cache": True,
    },
}

# yapf: enable
DEEPSPEED_CFG = {
    "zero_allow_untested_optimizer": True,
    "optimizer": {
        "type": "adam",
        "params": {
            "lr": 1e-4,
            "betas": [0.998, 0.999],
            "eps": 1e-3,
            "weight_decay": 0.001,
            "adam_w_mode": True,
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "last_batch_iteration": -1,
            "warmup_min_lr": 0,
            "warmup_max_lr": 3e-5,
            "warmup_num_steps": 100,
        },
    },
    "zero_optimization": {
        "stage": 0,  #0,1,2,3
        # "offload_optimizer":
        # False, # Enable Offloading optimizer state/calculation to the host CPU
    },
}
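For orientation (not part of the diff): a minimal sketch of how audio_cfg could be mapped onto a torchaudio mel-spectrogram front end. The helper name build_melspec_layer is hypothetical; the actual spectrogram layer is built inside the ymt3 model code and may differ.

# Hypothetical illustration of audio_cfg -> torchaudio MelSpectrogram; not the repository's API.
import torch
import torchaudio

def build_melspec_layer(cfg: dict) -> torch.nn.Module:
    # This sketch only covers the "melspec" codec (MT3-style); "spec" would use Spectrogram instead.
    assert cfg["codec"] == "melspec"
    return torchaudio.transforms.MelSpectrogram(
        sample_rate=cfg["sample_rate"],
        n_fft=cfg["n_fft"],
        hop_length=cfg["hop_length"],
        f_min=cfg["f_min"],
        f_max=cfg["f_max"],
        n_mels=cfg["n_mels"],
    )

# audio_cfg["input_frames"] (32767 samples at 16 kHz, ~2.048 s) fixes the input length,
# so a (batch, 32767) waveform yields a (batch, n_mels, time) feature map.
layer = build_melspec_layer(audio_cfg)
features = layer(torch.randn(2, audio_cfg["input_frames"]))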
amt/src/config/data_presets.py
ADDED
@@ -0,0 +1,811 @@
1 |
+
""" data.py:
|
2 |
+
Data presets for training and evaluation.
|
3 |
+
|
4 |
+
Single Presets:
|
5 |
+
musicnet_mt3
|
6 |
+
musicnet_em
|
7 |
+
musicnet_thickstun
|
8 |
+
slakh
|
9 |
+
guitarset
|
10 |
+
...
|
11 |
+
|
12 |
+
Multi Presets:
|
13 |
+
all_mmegs
|
14 |
+
...
|
15 |
+
|
16 |
+
"""
|
17 |
+
from config.vocabulary import *
|
18 |
+
from config.vocabulary import drum_vocab_presets, program_vocab_presets
|
19 |
+
from utils.utils import deduplicate_splits, merge_splits, merge_vocab
|
20 |
+
|
21 |
+
data_preset_single_cfg = {
|
22 |
+
"musicnet_mt3": {
|
23 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
24 |
+
"dataset_name": "musicnet",
|
25 |
+
"train_split": "train_mt3",
|
26 |
+
"validation_split": "validation_mt3_acoustic",
|
27 |
+
"test_split": "test_mt3_acoustic",
|
28 |
+
"has_stem": False,
|
29 |
+
},
|
30 |
+
"musicnet_mt3_synth_only": { # sanity-check
|
31 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
32 |
+
"dataset_name": "musicnet",
|
33 |
+
"train_split": "train_mt3_synth",
|
34 |
+
"validation_split": "validation_mt3_synth",
|
35 |
+
"test_split": "test_mt3_acoustic",
|
36 |
+
"has_stem": False,
|
37 |
+
},
|
38 |
+
"musicnet_mt3_em": {
|
39 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
40 |
+
"dataset_name": "musicnet",
|
41 |
+
"train_split": "train_mt3_em",
|
42 |
+
"validation_split": "validation_mt3_em",
|
43 |
+
"test_split": "test_mt3_em",
|
44 |
+
"has_stem": False,
|
45 |
+
},
|
46 |
+
"musicnet_thickstun": { # exp4
|
47 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
48 |
+
"dataset_name": "musicnet",
|
49 |
+
"train_split": "train_thickstun",
|
50 |
+
"validation_split": "test_thickstun",
|
51 |
+
"test_split": "test_thickstun",
|
52 |
+
"has_stem": False,
|
53 |
+
},
|
54 |
+
"musicnet_thickstun_em": { # NOTE: this is not the use of external 'synth' in the paper, but the use of 'synth' within the dataset
|
55 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
56 |
+
"dataset_name": "musicnet",
|
57 |
+
"train_split": "train_thickstun_em",
|
58 |
+
"validation_split": "test_thickstun_em",
|
59 |
+
"test_split": "test_thickstun_em",
|
60 |
+
"has_stem": False,
|
61 |
+
},
|
62 |
+
"musicnet_thickstun_ext": { # exp4
|
63 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
64 |
+
"dataset_name": "musicnet",
|
65 |
+
"train_split": "train_thickstun",
|
66 |
+
"validation_split": "test_thickstun_ext",
|
67 |
+
"test_split": "test_thickstun_ext",
|
68 |
+
"has_stem": False,
|
69 |
+
},
|
70 |
+
"musicnet_thickstun_ext_em": { # NOTE: this is not the use of external 'synth' in the paper, but the use of 'synth' within the dataset
|
71 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
72 |
+
"dataset_name": "musicnet",
|
73 |
+
"train_split": "train_thickstun_em",
|
74 |
+
"validation_split": "test_thickstun_ext_em",
|
75 |
+
"test_split": "test_thickstun_ext_em",
|
76 |
+
"has_stem": False,
|
77 |
+
},
|
78 |
+
"maps_default": {
|
79 |
+
"eval_vocab": [PIANO_SOLO_CLASS],
|
80 |
+
"dataset_name": "maps",
|
81 |
+
"train_split": "train",
|
82 |
+
"validation_split": "test",
|
83 |
+
"test_split": "test",
|
84 |
+
"has_stem": False,
|
85 |
+
},
|
86 |
+
"maps_all": {
|
87 |
+
"eval_vocab": [None],
|
88 |
+
"dataset_name": "maps",
|
89 |
+
"train_split": "all",
|
90 |
+
"validation_split": None,
|
91 |
+
"test_split": None,
|
92 |
+
"has_stem": False,
|
93 |
+
},
|
94 |
+
"maestro": {
|
95 |
+
"eval_vocab": [PIANO_SOLO_CLASS],
|
96 |
+
"dataset_name": "maestro",
|
97 |
+
"train_split": "train",
|
98 |
+
"validation_split": "validation",
|
99 |
+
"test_split": "test",
|
100 |
+
"has_stem": False,
|
101 |
+
},
|
102 |
+
"maestro_final": {
|
103 |
+
"eval_vocab": [PIANO_SOLO_CLASS],
|
104 |
+
"dataset_name": "maestro",
|
105 |
+
"train_split": merge_splits(["train", "validation"], dataset_name="maestro"),
|
106 |
+
"validation_split": "test",
|
107 |
+
"test_split": "test",
|
108 |
+
"has_stem": False,
|
109 |
+
},
|
110 |
+
"guitarset": { # 4 random players for train, 1 for valid, and 1 for test
|
111 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
112 |
+
"dataset_name": "guitarset",
|
113 |
+
"train_split": "train",
|
114 |
+
"validation_split": "validation",
|
115 |
+
"test_split": "test",
|
116 |
+
"has_stem": False,
|
117 |
+
},
|
118 |
+
"guitarset_pshift": { # guitarset + pitch shift
|
119 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
120 |
+
"dataset_name": "guitarset",
|
121 |
+
"train_split": "train_pshift",
|
122 |
+
"validation_split": "validation",
|
123 |
+
"test_split": "test",
|
124 |
+
"has_stem": False,
|
125 |
+
},
|
126 |
+
"guitarset_progression": { # progression 1 and 2 as train, progression 3 as test
|
127 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
128 |
+
"dataset_name": "guitarset",
|
129 |
+
"train_split": merge_splits(["progression_1", "progression_2"], dataset_name="guitarset"),
|
130 |
+
"validation_split": "progression_3",
|
131 |
+
"test_split": "progression_3",
|
132 |
+
"has_stem": False,
|
133 |
+
},
|
134 |
+
"guitarset_progression_pshift": { # guuitarset_progression + pitch shift
|
135 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
136 |
+
"dataset_name": "guitarset",
|
137 |
+
"train_split": merge_splits(["progression_1_pshift", "progression_2_pshift"], dataset_name="guitarset"),
|
138 |
+
"validation_split": "progression_3",
|
139 |
+
"test_split": "progression_3",
|
140 |
+
"has_stem": False,
|
141 |
+
},
|
142 |
+
"guitarset_minus_bn": { # guuitarset_style + pitch shift
|
143 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
144 |
+
"dataset_name": "guitarset",
|
145 |
+
"train_split": merge_splits(["Funk_pshift", "SS_pshift", "Jazz_pshift", "Rock_pshift"],
|
146 |
+
dataset_name="guitarset"),
|
147 |
+
"validation_split": "BN",
|
148 |
+
"test_split": "BN",
|
149 |
+
"has_stem": False,
|
150 |
+
},
|
151 |
+
"guitarset_minus_funk": { # guuitarset_style + pitch shift
|
152 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
153 |
+
"dataset_name": "guitarset",
|
154 |
+
"train_split": merge_splits(["BN_pshift", "SS_pshift", "Jazz_pshift", "Rock_pshift"],
|
155 |
+
dataset_name="guitarset"),
|
156 |
+
"validation_split": "Funk",
|
157 |
+
"test_split": "Funk",
|
158 |
+
"has_stem": False,
|
159 |
+
},
|
160 |
+
"guitarset_minus_ss": { # guuitarset_style + pitch shift
|
161 |
+
"eval_vocab": GUITAR_SOLO_CLASS,
|
162 |
+
"dataset_name": "guitarset",
|
163 |
+
"train_split": merge_splits(["BN_pshift", "Funk_pshift", "Jazz_pshift", "Rock_pshift"],
|
164 |
+
dataset_name="guitarset"),
|
165 |
+
"validation_split": "SS",
|
166 |
+
"test_split": "SS",
|
167 |
+
"has_stem": False,
|
168 |
+
},
|
169 |
+
"guitarset_minus_jazz": { # guuitarset_style + pitch shift
|
170 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
171 |
+
"dataset_name": "guitarset",
|
172 |
+
"train_split": merge_splits(["BN_pshift", "Funk_pshift", "SS_pshift", "Rock_pshift"],
|
173 |
+
dataset_name="guitarset"),
|
174 |
+
"validation_split": "Jazz",
|
175 |
+
"test_split": "Jazz",
|
176 |
+
"has_stem": False,
|
177 |
+
},
|
178 |
+
"guitarset_minus_rock": { # guuitarset_style + pitch shift
|
179 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
180 |
+
"dataset_name": "guitarset",
|
181 |
+
"train_split": merge_splits(["BN_pshift", "Funk_pshift", "SS_pshift", "Jazz_pshift"],
|
182 |
+
dataset_name="guitarset"),
|
183 |
+
"validation_split": "Rock",
|
184 |
+
"test_split": "Rock",
|
185 |
+
"has_stem": False,
|
186 |
+
},
|
187 |
+
"guitarset_all": {
|
188 |
+
"eval_vocab": [None],
|
189 |
+
"dataset_name": "guitarset",
|
190 |
+
"train_split": "all",
|
191 |
+
"validation_split": None,
|
192 |
+
"test_split": None,
|
193 |
+
"has_stem": False,
|
194 |
+
},
|
195 |
+
"enstdrums_dtp": {
|
196 |
+
"eval_vocab": [None],
|
197 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
198 |
+
"dataset_name": "enstdrums",
|
199 |
+
"train_split": merge_splits(["drummer_1_dtp", "drummer_2_dtp", "drummer_1_dtp", "drummer_2_dtp"], dataset_name="enstdrums"),
|
200 |
+
"validation_split": "drummer_1_dtp", # for sanity check
|
201 |
+
"test_split": "drummer_3_dtp",
|
202 |
+
"has_stem": False,
|
203 |
+
},
|
204 |
+
"enstdrums_dtm": {
|
205 |
+
"eval_vocab": [None],
|
206 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
207 |
+
"dataset_name": "enstdrums",
|
208 |
+
"train_split": merge_splits(["drummer_1_dtm", "drummer_2_dtm", "drummer_1_dtp", "drummer_2_dtp"], dataset_name="enstdrums"),
|
209 |
+
"validation_split": "drummer_3_dtm_r2", # 0.6 * drum
|
210 |
+
"test_split": "drummer_3_dtm_r1", # 0.75 * drum
|
211 |
+
"has_stem": True,
|
212 |
+
},
|
213 |
+
"enstdrums_random_dtm": { # single dataset training as a denoising ADT model
|
214 |
+
"eval_vocab": [None],
|
215 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
216 |
+
"dataset_name": "enstdrums",
|
217 |
+
"train_split": "train_dtm",
|
218 |
+
"validation_split": "validation_dtm",
|
219 |
+
"test_split": "test_dtm",
|
220 |
+
"has_stem": True,
|
221 |
+
},
|
222 |
+
"enstdrums_random": { # multi dataset training with random split of 70:15:15
|
223 |
+
"eval_vocab": [None],
|
224 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
225 |
+
"dataset_name": "enstdrums",
|
226 |
+
"train_split": "train_dtp",
|
227 |
+
"validation_split": "test_dtm",
|
228 |
+
"test_split": "test_dtm",
|
229 |
+
"has_stem": True,
|
230 |
+
},
|
231 |
+
"enstdrums_random_plus_dtd": { # multi dataset training plus dtd
|
232 |
+
"eval_vocab": [None],
|
233 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
234 |
+
"dataset_name": "enstdrums",
|
235 |
+
"train_split": merge_splits(["train_dtp", "all_dtd"], dataset_name="enstdrums"),
|
236 |
+
"validation_split": "test_dtm",
|
237 |
+
"test_split": "test_dtm",
|
238 |
+
"has_stem": True,
|
239 |
+
},
|
240 |
+
"mir_st500": {
|
241 |
+
"eval_vocab": [SINGING_SOLO_CLASS],
|
242 |
+
"dataset_name": "mir_st500",
|
243 |
+
"train_split": "train_stem",
|
244 |
+
"validation_split": "test",
|
245 |
+
"test_split": "test",
|
246 |
+
"has_stem": True,
|
247 |
+
},
|
248 |
+
"mir_st500_voc": {
|
249 |
+
"eval_vocab": [SINGING_SOLO_CLASS],
|
250 |
+
"dataset_name": "mir_st500",
|
251 |
+
"train_split": "train_vocal",
|
252 |
+
"validation_split": "test_vocal",
|
253 |
+
"test_split": "test_vocal",
|
254 |
+
"has_stem": False,
|
255 |
+
},
|
256 |
+
"mir_st500_voc_debug": { # using train_vocal for test (for debugging)
|
257 |
+
"eval_vocab": [SINGING_SOLO_CLASS],
|
258 |
+
"dataset_name": "mir_st500",
|
259 |
+
"train_split": "train_vocal",
|
260 |
+
"validation_split": "test_vocal",
|
261 |
+
"test_split": "train_vocal",
|
262 |
+
"has_stem": False,
|
263 |
+
},
|
264 |
+
"slakh": {
|
265 |
+
"eval_vocab": [GM_INSTR_CLASS],
|
266 |
+
"eval_drum_vocab": drum_vocab_presets["gm"],
|
267 |
+
"dataset_name": "slakh",
|
268 |
+
"train_split": "train",
|
269 |
+
"validation_split": "validation",
|
270 |
+
"test_split": "test",
|
271 |
+
"has_stem": True,
|
272 |
+
},
|
273 |
+
"slakh_final": {
|
274 |
+
"eval_vocab": [GM_INSTR_CLASS],
|
275 |
+
"eval_drum_vocab": drum_vocab_presets["gm"],
|
276 |
+
"dataset_name": "slakh",
|
277 |
+
"train_split": merge_splits(["train", "validation"], dataset_name="slakh"),
|
278 |
+
"validation_split": "test",
|
279 |
+
"test_split": "test",
|
280 |
+
"has_stem": True,
|
281 |
+
},
|
282 |
+
"rwc_pop_bass": {
|
283 |
+
"eval_vocab": [BASS_SOLO_CLASS],
|
284 |
+
"add_pitch_class_metric": ["Bass"],
|
285 |
+
"dataset_name": "rwc_pop",
|
286 |
+
"train_split": None,
|
287 |
+
"validation_split": "bass",
|
288 |
+
"test_split": "bass",
|
289 |
+
"has_stem": False,
|
290 |
+
},
|
291 |
+
"rwc_pop_full": {
|
292 |
+
"eval_vocab": [GM_INSTR_CLASS_PLUS],
|
293 |
+
"add_pitch_class_metric": list(GM_INSTR_CLASS_PLUS.keys()),
|
294 |
+
"dataset_name": "rwc_pop",
|
295 |
+
"train_split": None,
|
296 |
+
"validation_split": "full",
|
297 |
+
"test_split": "full",
|
298 |
+
"has_stem": False,
|
299 |
+
},
|
300 |
+
"egmd": {
|
301 |
+
"eval_vocab": [None],
|
302 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"],
|
303 |
+
"dataset_name": "egmd",
|
304 |
+
"train_split": "train",
|
305 |
+
"validation_split": "validation",
|
306 |
+
"test_split": "test_reduced", # EGMD has 5000+ test files, so we reudce it to 200 files to save time
|
307 |
+
# "train_limit_num_files": 4402, #8804, # 17608, # limit the number of files for training to random choice of half.
|
308 |
+
"has_stem": False,
|
309 |
+
},
|
310 |
+
"urmp": {
|
311 |
+
"eval_vocab": [GM_INSTR_CLASS],
|
312 |
+
"dataset_name": "urmp",
|
313 |
+
"train_split": "train",
|
314 |
+
"validation_split": "test",
|
315 |
+
"test_split": "test",
|
316 |
+
"has_stem": True,
|
317 |
+
},
|
318 |
+
"cmedia": {
|
319 |
+
"eval_vocab": [SINGING_SOLO_CLASS],
|
320 |
+
"dataset_name": "cmedia",
|
321 |
+
"train_split": "train_stem",
|
322 |
+
"validation_split": "train",
|
323 |
+
"test_split": "train",
|
324 |
+
"has_stem": True,
|
325 |
+
},
|
326 |
+
"cmedia_voc": {
|
327 |
+
"eval_vocab": [SINGING_SOLO_CLASS],
|
328 |
+
"dataset_name": "cmedia",
|
329 |
+
"train_split": "train_vocal",
|
330 |
+
"validation_split": "train_vocal",
|
331 |
+
"test_split": "train_vocal",
|
332 |
+
"has_stem": False,
|
333 |
+
},
|
334 |
+
"idmt_smt_bass": {
|
335 |
+
"eval_vocab": [BASS_SOLO_CLASS],
|
336 |
+
"dataset_name": "idmt_smt_bass",
|
337 |
+
"train_split": "train",
|
338 |
+
"validation_split": "validation",
|
339 |
+
"test_split": "validation",
|
340 |
+
"has_stem": False,
|
341 |
+
},
|
342 |
+
"geerdes": { # full mix dataset for evaluation
|
343 |
+
"eval_vocab": [GM_INSTR_CLASS_PLUS],
|
344 |
+
"dataset_name": "geerdes",
|
345 |
+
"train_split": None,
|
346 |
+
"validation_split": None,
|
347 |
+
"test_split": "all",
|
348 |
+
"has_stem": False,
|
349 |
+
},
|
350 |
+
"geerdes_sep": { # Using vocal/accomp separation for evalutation
|
351 |
+
"eval_vocab": [GM_INSTR_CLASS_PLUS],
|
352 |
+
"dataset_name": "geerdes",
|
353 |
+
"train_split": None,
|
354 |
+
"validation_split": None,
|
355 |
+
"test_split": "all_sep",
|
356 |
+
"has_stem": False,
|
357 |
+
},
|
358 |
+
"geerdes_half": { # Using half dataset for train/val
|
359 |
+
"eval_vocab": [GM_INSTR_CLASS_PLUS],
|
360 |
+
"dataset_name": "geerdes",
|
361 |
+
"train_split": "train",
|
362 |
+
"validation_split": "validation",
|
363 |
+
"test_split": "validation",
|
364 |
+
"has_stem": False,
|
365 |
+
},
|
366 |
+
"geerdes_half_sep": { # Using half dataset with vocal/accomp separation for train/val
|
367 |
+
"eval_vocab": [GM_INSTR_CLASS_PLUS],
|
368 |
+
"dataset_name": "geerdes",
|
369 |
+
"train_split": "train_sep",
|
370 |
+
"validation_split": "validation_sep",
|
371 |
+
"test_split": "validation_sep",
|
372 |
+
"has_stem": False,
|
373 |
+
},
|
374 |
+
}
|
375 |
+
|
376 |
+
data_preset_multi_cfg = {
|
377 |
+
"musicnet_mt3_em_synth_plus_maps": {
|
378 |
+
"presets": ["musicnet_mt3_em_synth", "maps_all"],
|
379 |
+
"weights": [0.6, 0.4],
|
380 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
381 |
+
},
|
382 |
+
"musicnet_em_synth_table2_plus_maps": {
|
383 |
+
"presets": ["musicnet_em_synth_table2", "maps_all"],
|
384 |
+
"weights": [0.6, 0.4],
|
385 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
386 |
+
},
|
387 |
+
"musicnet_em_synth_table2_plus_maps_multi": {
|
388 |
+
"presets": ["musicnet_em_synth_table2", "maps_default"],
|
389 |
+
"weights": [0.6, 0.4],
|
390 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
391 |
+
},
|
392 |
+
"guitarset_progression_plus_maps": {
|
393 |
+
"presets": ["guitarset_progression", "maps_all"],
|
394 |
+
"weights": [0.5, 0.5],
|
395 |
+
"eval_vocab": [GUITAR_SOLO_CLASS],
|
396 |
+
},
|
397 |
+
"guitarset_pshift_plus_maps": {
|
398 |
+
"presets": ["guitarset_pshift", "maps_default"],
|
399 |
+
"weights": [0.6, 0.4],
|
400 |
+
"eval_vocab": [merge_vocab([GUITAR_SOLO_CLASS, PIANO_SOLO_CLASS])],
|
401 |
+
},
|
402 |
+
"guitarset_pshift_plus_musicnet_thick": {
|
403 |
+
"presets": ["guitarset_pshift", "musicnet_thickstun_em"],
|
404 |
+
"weights": [0.5, 0.5],
|
405 |
+
"eval_vocab": [merge_vocab([GUITAR_SOLO_CLASS, PIANO_SOLO_CLASS])],
|
406 |
+
},
|
407 |
+
"multi_sanity_check": {
|
408 |
+
"presets": ["musicnet_mt3_synth_only", "musicnet_mt3_synth_only"],
|
409 |
+
"weights": [0.6, 0.4],
|
410 |
+
"eval_vocab": [MUSICNET_INSTR_CLASS],
|
411 |
+
},
|
412 |
+
"all_mmegs": {
|
413 |
+
"presets": [
|
414 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp", "guitarset_pshift"
|
415 |
+
],
|
416 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
417 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
418 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
419 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
420 |
+
"test_max_num_files": None,
|
421 |
+
},
|
422 |
+
"all_gt_cv0": {
|
423 |
+
"presets": [
|
424 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp", "guitarset_minus_bn"
|
425 |
+
],
|
426 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
427 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
428 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
429 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
430 |
+
"test_max_num_files": None,
|
431 |
+
},
|
432 |
+
"all_gt_cv1": {
|
433 |
+
"presets": [
|
434 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
435 |
+
"guitarset_minus_funk"
|
436 |
+
],
|
437 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
438 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
439 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
440 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
441 |
+
"test_max_num_files": None,
|
442 |
+
},
|
443 |
+
"all_gt_cv2": {
|
444 |
+
"presets": [
|
445 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp", "guitarset_minus_ss"
|
446 |
+
],
|
447 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
448 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
449 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
450 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
451 |
+
"test_max_num_files": None,
|
452 |
+
},
|
453 |
+
"all_gt_cv3": {
|
454 |
+
"presets": [
|
455 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
456 |
+
"guitarset_minus_rock"
|
457 |
+
],
|
458 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
459 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
460 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
461 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
462 |
+
"test_max_num_files": None,
|
463 |
+
},
|
464 |
+
"all_gt_cv4": {
|
465 |
+
"presets": [
|
466 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
467 |
+
"guitarset_minus_jazz"
|
468 |
+
],
|
469 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
470 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
471 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
472 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
473 |
+
"test_max_num_files": None,
|
474 |
+
},
|
475 |
+
"all_enstdrums_random": {
|
476 |
+
"presets": [
|
477 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_random", "guitarset"
|
478 |
+
],
|
479 |
+
"weights": [0.2, 0.2, 0.2, 0.2, 0.2],
|
480 |
+
"eval_vocab": [None] * 5, # None means instrument-agnostic F1 for each dataset
|
481 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
482 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
483 |
+
"test_max_num_files": None,
|
484 |
+
},
|
485 |
+
"all_plus_egmd": {
|
486 |
+
"presets": [
|
487 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_random_plus_dtd",
|
488 |
+
"guitarset", "egmd"
|
489 |
+
],
|
490 |
+
"weights": [0.2, 0.2, 0.2, 0.1, 0.1, 0.2],
|
491 |
+
"eval_vocab": [None] * 6, # None means instrument-agnostic F1 for each dataset
|
492 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
493 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
494 |
+
"test_max_num_files": None,
|
495 |
+
},
|
496 |
+
"all_dtp_egmd": {
|
497 |
+
"presets": [
|
498 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp", "guitarset", "egmd"
|
499 |
+
],
|
500 |
+
"weights": [0.2, 0.2, 0.2, 0.1, 0.1, 0.2],
|
501 |
+
"eval_vocab": [None] * 6, # None means instrument-agnostic F1 for each dataset
|
502 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
503 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
504 |
+
"test_max_num_files": None,
|
505 |
+
},
|
506 |
+
"all_weighted_slakh": {
|
507 |
+
"presets": [
|
508 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp", "guitarset_pshift", "egmd"
|
509 |
+
],
|
510 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.2],
|
511 |
+
"eval_vocab": [None] * 6, # None means instrument-agnostic F1 for each dataset
|
512 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
513 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
514 |
+
"test_max_num_files": None,
|
515 |
+
},
|
516 |
+
"all_weighted_mt3": { # for comparison with MT3
|
517 |
+
"presets": [
|
518 |
+
"slakh", "musicnet_mt3", "mir_st500_voc", "enstdrums_dtp",
|
519 |
+
"guitarset_progression_pshift", "egmd"
|
520 |
+
],
|
521 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.2],
|
522 |
+
"eval_vocab": [None] * 6, # None means instrument-agnostic F1 for each dataset
|
523 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
524 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
525 |
+
"test_max_num_files": None,
|
526 |
+
},
|
527 |
+
"all_weighted_mt3_em": { # musicnet_mt3_em
|
528 |
+
"presets": [
|
529 |
+
"slakh", "musicnet_mt3_em", "mir_st500_voc", "enstdrums_dtp",
|
530 |
+
"guitarset_progression_pshift", "egmd"
|
531 |
+
],
|
532 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.2],
|
533 |
+
"eval_vocab": [None] * 6, # None means instrument-agnoßstic F1 for each dataset
|
534 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
535 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
536 |
+
"test_max_num_files": None,
|
537 |
+
},
|
538 |
+
"all_urmp": {
|
539 |
+
"presets": [
|
540 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
541 |
+
"guitarset_pshift", "egmd", "urmp"
|
542 |
+
],
|
543 |
+
"weights": [0.5, 0.2, 0.1, 0.05, 0.05, 0.05, 0.1],
|
544 |
+
"eval_vocab": [None] * 7, # None means instrument-agnostic F1 for each dataset
|
545 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
546 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
547 |
+
"test_max_num_files": None,
|
548 |
+
},
|
549 |
+
"all_urmp_mt3": { # for comparison with MT3 including URMP
|
550 |
+
"presets": [
|
551 |
+
"slakh", "musicnet_mt3", "mir_st500_voc", "enstdrums_dtp",
|
552 |
+
"guitarset_progression", "egmd", "urmp"
|
553 |
+
],
|
554 |
+
"weights": [0.5, 0.2, 0.1, 0.05, 0.05, 0.0125, 0.1],
|
555 |
+
"eval_vocab": [None] * 7, # None means instrument-agnostic F1 for each dataset
|
556 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
557 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
558 |
+
"test_max_num_files": None,
|
559 |
+
},
|
560 |
+
"all_urmp_mt3_em": { # musicnet_mt3_em including URMP
|
561 |
+
"presets": [
|
562 |
+
"slakh", "musicnet_mt3_em", "mir_st500_voc", "enstdrums_dtp",
|
563 |
+
"guitarset_progression", "egmd", "urmp"
|
564 |
+
],
|
565 |
+
"weights": [0.5, 0.2, 0.1, 0.05, 0.05, 0.0125, 0.1],
|
566 |
+
"eval_vocab": [None] * 7, # None means instrument-agnostic F1 for each dataset
|
567 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
568 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
569 |
+
"test_max_num_files": None,
|
570 |
+
},
|
571 |
+
"all_maestro": { # including Mestro and URMP
|
572 |
+
"presets": [
|
573 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
574 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
575 |
+
],
|
576 |
+
"weights": [0.5, 0.1, 0.125, 0.075, 0.025, 0.01, 0.1, 0.1],
|
577 |
+
"eval_vocab": [None] * 8, # None means instrument-agnostic F1 for each dataset
|
578 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
579 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
580 |
+
"test_max_num_files": None,
|
581 |
+
},
|
582 |
+
"all_maestro_mt3": { # for comparison with MT3 including URMP
|
583 |
+
"presets": [
|
584 |
+
"slakh", "musicnet_mt3", "mir_st500_voc", "enstdrums_dtp",
|
585 |
+
"guitarset_progression", "egmd", "urmp", "maestro"
|
586 |
+
],
|
587 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.0125, 0.1, 0.1],
|
588 |
+
"eval_vocab": [None] * 8, # None means instrument-agnostic F1 for each dataset
|
589 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
590 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
591 |
+
"test_max_num_files": None,
|
592 |
+
},
|
593 |
+
"all_maestro_mt3_em": { # musicnet_mt3_em including URMP
|
594 |
+
"presets": [
|
595 |
+
"slakh", "musicnet_mt3_em", "mir_st500_voc", "enstdrums_dtp",
|
596 |
+
"guitarset_progression", "egmd", "urmp", "maestro"
|
597 |
+
],
|
598 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.0125, 0.1, 0.1],
|
599 |
+
"eval_vocab": [None] * 8, # None means instrument-agnostic F1 for each dataset
|
600 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
601 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
602 |
+
"test_max_num_files": None,
|
603 |
+
},
|
604 |
+
"singing_v1": { # slakh + mir_st500 without spleeter
|
605 |
+
"presets": ["slakh", "mir_st500"],
|
606 |
+
"weights": [0.8, 0.2],
|
607 |
+
"eval_vocab": [None, SINGING_SOLO_CLASS], # None means instrument-agnostic F1 for each dataset
|
608 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
609 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
610 |
+
"test_max_num_files": None,
|
611 |
+
},
|
612 |
+
"all_singing_v1": { # for singing-only task
|
613 |
+
"presets": [
|
614 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_stem", "enstdrums_dtp",
|
615 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
616 |
+
],
|
617 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.0125, 0.1, 0.1],
|
618 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
619 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
620 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
621 |
+
"test_max_num_files": None,
|
622 |
+
},
|
623 |
+
"all_singing_drum_v1": { # for singing-only and drum-only tasks
|
624 |
+
"presets": [
|
625 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_stem", "enstdrums_dtm",
|
626 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
627 |
+
],
|
628 |
+
"weights": [0.5, 0.1, 0.1, 0.05, 0.05, 0.0125, 0.1, 0.1],
|
629 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
630 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
631 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
632 |
+
"test_max_num_files": None,
|
633 |
+
},
|
634 |
+
"all_cross": { # including Mestro and URMP
|
635 |
+
"presets": [
|
636 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
637 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
638 |
+
],
|
639 |
+
"weights": [0.5, 0.1, 0.125, 0.075, 0.025, 0.01, 0.1, 0.1],
|
640 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
641 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
642 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
643 |
+
"test_max_num_files": None,
|
644 |
+
},
|
645 |
+
"all_cross_rebal": { # rebalanced for cross-augment, using spleeter
|
646 |
+
"presets": [
|
647 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
648 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
649 |
+
],
|
650 |
+
"weights": [0.4, 0.15, 0.15, 0.075, 0.025, 0.01, 0.1, 0.1],
|
651 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
652 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
653 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
654 |
+
"test_max_num_files": None,
|
655 |
+
},
|
656 |
+
"all_cross_rebal2": { # rebalanced for cross-augment, using spleeter
|
657 |
+
"presets": [
|
658 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
659 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
660 |
+
],
|
661 |
+
"weights": [0.275, 0.19, 0.19, 0.1, 0.025, 0.02, 0.1, 0.1],
|
662 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
663 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
664 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
665 |
+
"test_max_num_files": None,
|
666 |
+
},
|
667 |
+
"all_cross_rebal4": { # rebalanced for cross-augment, using spleeter
|
668 |
+
"presets": [
|
669 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
670 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
671 |
+
],
|
672 |
+
"weights": [0.258, 0.19, 0.2, 0.125, 0.022, 0.005, 0.1, 0.1],
|
673 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
674 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
675 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
676 |
+
"test_max_num_files": None,
|
677 |
+
},
|
678 |
+
"all_cross_rebal5": { # rebalanced for cross-augment, using spleeter
|
679 |
+
"presets": [
|
680 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
|
681 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
682 |
+
],
|
683 |
+
"weights": [0.295, 0.19, 0.24, 0.05, 0.02, 0.005, 0.1, 0.1],
|
684 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
685 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
686 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
687 |
+
"test_max_num_files": None,
|
688 |
+
},
|
689 |
+
"all_cross_stem": { # accomp stem for sub-task learning + rebalanced for cross-augment
|
690 |
+
"presets": [
|
691 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_stem", "enstdrums_dtm",
|
692 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
693 |
+
],
|
694 |
+
"weights": [0.4, 0.15, 0.15, 0.075, 0.025, 0.01, 0.1, 0.1],
|
695 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
696 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
697 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
698 |
+
"test_max_num_files": None,
|
699 |
+
},
|
700 |
+
"all_cross_stem_rebal3": { # accomp stem for sub-task learning + rebalanced for cross-augment
|
701 |
+
"presets": [
|
702 |
+
"slakh", "musicnet_thickstun_em", "mir_st500_stem", "enstdrums_dtm",
|
703 |
+
"guitarset_pshift", "egmd", "urmp", "maestro"
|
704 |
+
],
|
705 |
+
"weights": [0.265, 0.18, 0.21, 0.1, 0.025, 0.02, 0.1, 0.1],
|
706 |
+
"eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None], # None means instrument-agnostic F1 for each dataset
|
707 |
+
"eval_drum_vocab": drum_vocab_presets["ksh"], # for drums, kick-snare-hihat metric
|
708 |
+
"val_max_num_files": 20, # max 20 files per dataset
|
709 |
+
"test_max_num_files": None,
|
710 |
+
},
|
711 |
+
"all_cross_v6": { # +cmeida +idmt_smt_bass
|
712 |
+        "presets": [
+            "slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
+            "guitarset", "egmd", "urmp", "maestro", "idmt_smt_bass", "cmedia_voc",
+        ],
+        "weights": [0.295, 0.19, 0.19, 0.05, 0.01, 0.005, 0.1, 0.1, 0.01, 0.05],
+        "eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None, BASS_SOLO_CLASS, SINGING_SOLO_CLASS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "all_cross_v6_geerdes": {  # +geerdes_half
+        "presets": [
+            "slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
+            "guitarset", "egmd", "urmp", "maestro", "idmt_smt_bass", "cmedia_voc",
+            "geerdes_half", "geerdes_half_sep"
+        ],
+        "weights": [0.295, 0.19, 0.19, 0.05, 0.01, 0.005, 0.075, 0.075, 0.01, 0.05, 0.025, 0.025],
+        "eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None, BASS_SOLO_CLASS,
+                       SINGING_SOLO_CLASS, GM_INSTR_CLASS_PLUS, GM_INSTR_CLASS_PLUS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "all_cross_v6_geerdes_rebal": {  # +geerdes_half, rebalanced weights
+        "presets": [
+            "slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
+            "guitarset", "egmd", "urmp", "maestro", "idmt_smt_bass", "cmedia_voc",
+            "geerdes_half", "geerdes_half_sep"
+        ],
+        "weights": [0.245, 0.175, 0.19, 0.05, 0.01, 0.005, 0.075, 0.05, 0.01, 0.05, 0.075, 0.075],
+        "eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None, BASS_SOLO_CLASS,
+                       SINGING_SOLO_CLASS, GM_INSTR_EXT_CLASS_PLUS, GM_INSTR_EXT_CLASS_PLUS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "all_cross_v7": {
+        "presets": [
+            "slakh", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
+            "guitarset_progression_pshift", "egmd", "urmp", "maestro", "idmt_smt_bass", "cmedia_voc",
+        ],
+        "weights": [0.295, 0.19, 0.191, 0.05, 0.01, 0.004, 0.1, 0.1, 0.01, 0.05],
+        "eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None, BASS_SOLO_CLASS, SINGING_SOLO_CLASS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "all_cross_final": {
+        "presets": [
+            "slakh_final", "musicnet_thickstun_em", "mir_st500_voc", "enstdrums_dtp",
+            "guitarset_progression_pshift", "egmd", "urmp", "maestro_final", "idmt_smt_bass", "cmedia_voc",
+        ],
+        "weights": [0.295, 0.19, 0.191, 0.05, 0.01, 0.004, 0.1, 0.1, 0.01, 0.05],
+        "eval_vocab": [None, None, SINGING_SOLO_CLASS, None, None, None, None, None, BASS_SOLO_CLASS, SINGING_SOLO_CLASS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "all_eval_final": {  # The final evaluation set
+        "presets": [
+            "slakh", "musicnet_thickstun", "musicnet_thickstun_em", "musicnet_thickstun_ext",
+            "musicnet_thickstun_ext_em", "mir_st500_voc", "mir_st500", "enstdrums_dtp",
+            "enstdrums_dtm", "guitarset_progression_pshift", "rwc_pop_bass", "maestro", "urmp",
+            "maps_default", "rwc_pop_full",  # "geerdes", "geerdes_sep",
+        ],
+        "eval_vocab": [
+            GM_INSTR_CLASS, MUSICNET_INSTR_CLASS, MUSICNET_INSTR_CLASS, MUSICNET_INSTR_CLASS,
+            MUSICNET_INSTR_CLASS, SINGING_SOLO_CLASS, SINGING_SOLO_CLASS, None,
+            None, None, BASS_SOLO_CLASS, PIANO_SOLO_CLASS, GM_INSTR_CLASS,
+            PIANO_SOLO_CLASS, GM_INSTR_CLASS_PLUS,  # GM_INSTR_CLASS_PLUS, GM_INSTR_CLASS_PLUS
+        ],
+        "eval_drum_vocab": drum_vocab_presets["ksh"],
+    },
+    "geerdes_eval": {  # Geerdes evaluation sets for models trained without Geerdes.
+        "presets": ["geerdes_sep", "geerdes"],
+        "eval_vocab": [GM_INSTR_CLASS_PLUS, GM_INSTR_CLASS_PLUS],
+        "eval_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "geerdes_half_eval": {  # Geerdes evaluation sets for models trained with Geerdes-half
+        "presets": ["geerdes_half_sep", "geerdes_half"],
+        "eval_vocab": [GM_INSTR_CLASS_PLUS, GM_INSTR_CLASS_PLUS],
+        "eval_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "minimal": {  # slakh + mir_st500 with spleeter
+        "presets": ["slakh", "mir_st500_voc"],
+        "weights": [0.8, 0.2],
+        "eval_vocab": [None, SINGING_SOLO_CLASS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+    "singing_debug": {  # mir_st500 vocal stems only, for debugging
+        "presets": ["mir_st500_voc_debug"],
+        "weights": [1.0],
+        "eval_vocab": [SINGING_SOLO_CLASS],  # None means instrument-agnostic F1 for each dataset
+        "eval_drum_vocab": drum_vocab_presets["ksh"],  # for drums, kick-snare-hihat metric
+        "val_max_num_files": 20,  # max 20 files per dataset
+        "test_max_num_files": None,
+    },
+}
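The "weights" list in each multi-preset entry is aligned one-to-one with "presets" and controls how often each dataset is drawn during training, while "eval_vocab" / "eval_drum_vocab" select the metric vocabularies per dataset. A minimal sketch of weighted preset sampling (illustrative only, not part of the diff; it only assumes the data_preset_multi_cfg dict defined in this file):

import random
from config.data_presets import data_preset_multi_cfg

def sample_preset_name(cfg_key: str = "all_cross_final") -> str:
    # Draw one dataset preset name with probability proportional to its weight.
    cfg = data_preset_multi_cfg[cfg_key]
    return random.choices(cfg["presets"], weights=cfg.get("weights"), k=1)[0]

print(sample_preset_name())  # "slakh_final" roughly 30% of the time (weight 0.295)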
amt/src/config/task.py
ADDED
@@ -0,0 +1,119 @@
+# Copyright 2024 The YourMT3 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Please see the details in the LICENSE file.
+"""task.py"""
+from config.vocabulary import *
+from utils.note_event_dataclasses import Event
+
+task_cfg = {
+    "mt3_midi": {  # 11 classes + drum class
+        "name": "mt3_midi",
+        "train_program_vocab": program_vocab_presets["mt3_midi"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "mt3_midi_plus": {  # 11 classes + singing + drum class
+        "name": "mt3_midi_plus",
+        "train_program_vocab": program_vocab_presets["mt3_midi_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "mt3_full": {  # 34 classes (except drums) as in MT3 paper
+        "name": "mt3_full",
+        "train_program_vocab": program_vocab_presets["mt3_full"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "mt3_full_plus": {  # 34 classes (except drums) as in MT3 paper + singing + drum class
+        "name": "mt3_full_plus",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "gm_ext_plus": {  # 13 classes + singing + chorus (except drums)
+        "name": "gm_ext_plus",
+        "train_program_vocab": program_vocab_presets["gm_ext_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+    },
+    "singing_v1": {
+        "name": "singing",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "subtask_tokens": ["task", "transcribe_singing", "transcribe_all"],
+        "ignore_decoding_tokens": ["task", "transcribe_singing", "transcribe_all"],
+        "max_task_token_length": 2,
+        "eval_subtask_prefix": {
+            "default": [Event("transcribe_all", 0), Event("task", 0)],
+            "singing-only": [Event("transcribe_singing", 0),
+                             Event("task", 0)],
+        }
+    },
+    "singing_drum_v1": {
+        "name": "singing_drum",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "subtask_tokens": ["task", "transcribe_singing", "transcribe_drum", "transcribe_all"],
+        "ignore_decoding_tokens": [
+            "task", "transcribe_singing", "transcribe_drum", "transcribe_all"
+        ],
+        "max_task_token_length": 2,
+        "eval_subtask_prefix": {
+            "default": [Event("transcribe_all", 0), Event("task", 0)],
+            "singing-only": [Event("transcribe_singing", 0),
+                             Event("task", 0)],
+            "drum-only": [Event("transcribe_drum", 0),
+                          Event("task", 0)],
+        }
+    },
+    "mc13": {  # multi-channel decoding task of {11 classes + drums + singing}
+        "name": "mc13",
+        "train_program_vocab": program_vocab_presets["gm_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "num_decoding_channels": len(program_vocab_presets["gm_plus"]) + 1,  # 13
+        "max_note_token_length_per_ch": 512,  # multi-channel decoding exclusive parameter
+        "mask_loss_strategy": None,  # multi-channel decoding exclusive parameter
+    },
+    "mc13_256": {  # multi-channel decoding task of {11 classes + drums + singing}
+        "name": "mc13_256",
+        "train_program_vocab": program_vocab_presets["gm_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "num_decoding_channels": len(program_vocab_presets["gm_plus"]) + 1,  # 13
+        "max_note_token_length_per_ch": 256,  # multi-channel decoding exclusive parameter
+        "mask_loss_strategy": None,  # multi-channel decoding exclusive parameter
+    },
+    "mc13_full_plus": {  # multi-channel decoding task of {34 classes + drums + singing & chorus} mapped to 13 channels
+        "name": "mc13_full_plus",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "program2channel_vocab_source": program_vocab_presets["gm_plus"],
+        "num_decoding_channels": 13,
+        "max_note_token_length_per_ch": 512,  # multi-channel decoding exclusive parameter
+        "mask_loss_strategy": None,  # multi-channel decoding exclusive parameter
+    },
+    "mc13_full_plus_256": {  # multi-channel decoding task of {34 classes + drums + singing & chorus} mapped to 13 channels
+        "name": "mc13_full_plus_256",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "program2channel_vocab_source": program_vocab_presets["gm_plus"],
+        "num_decoding_channels": 13,
+        "max_note_token_length_per_ch": 256,  # multi-channel decoding exclusive parameter
+        "mask_loss_strategy": None,  # multi-channel decoding exclusive parameter
+    },
+    "exc_v1": {
+        "name": "exclusive",
+        "train_program_vocab": program_vocab_presets["mt3_full_plus"],
+        "train_drum_vocab": drum_vocab_presets["gm"],
+        "subtask_tokens": ["transcribe", "all", ":"],
+        # "ignore_decoding_tokens": [
+        #     "task", "transcribe_singing", "transcribe_drum", "transcribe_all"
+        # ],
+        # "max_task_token_length": 2,
+        "ignore_decoding_tokens_from_and_to": ["transcribe", ":"],
+        "eval_subtask_prefix": {  # this is the main task that transcribes all instruments
+            "default": [Event("transcribe", 0), Event("all", 0), Event(":", 0)],
+        },
+        "shuffle_subtasks": True,
+    },
+}
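For reference, a task preset is a plain dict; the following illustrative lines (not part of the diff) show how one entry can be inspected:

from config.task import task_cfg

cfg = task_cfg["mc13_full_plus"]
print(cfg["name"])                      # "mc13_full_plus"
print(cfg["num_decoding_channels"])     # 13 decoder channels
print(len(cfg["train_program_vocab"]))  # number of trainable program classes (MT3_FULL_PLUS)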
amt/src/config/vocabulary.py
ADDED
@@ -0,0 +1,384 @@
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
"""vocabulary.py
|
11 |
+
|
12 |
+
Vocabulary for instrument classes. Vocabulary can be used as train_vocab
|
13 |
+
or test_vocab in data_presets.py or train.py arguments.
|
14 |
+
|
15 |
+
- When it is used as train_vocab, it maps the instrument classes to the first
|
16 |
+
program number of the class. For example, if you use 'GM_INSTR_CLASS' as
|
17 |
+
train_vocab, then the program number of 'Piano' is [0,1,2,3,4,5,6,7]. These
|
18 |
+
program numbers are trained as program [0] in the model.
|
19 |
+
|
20 |
+
- When it is used as eval_vocab, any program number in the instrument class
|
21 |
+
is considered as correct.
|
22 |
+
|
23 |
+
|
24 |
+
MUSICNET_INSTR_CLASS: 3 classes used for MusicNet benchmark
|
25 |
+
GM_INSTR_CLASS: equivalent to 'MIDI Class' defined by MT3.
|
26 |
+
GM_INSTR_CLASS_PLUS: GM_INSTR_CLASS + singing voice
|
27 |
+
GM_INSTR_FULL: 128 GM instruments, which is extended from 'MT3_FULL'
|
28 |
+
MT3_FULL: this matches the class names in Table 3 of MT3 paper
|
29 |
+
ENST_DRUM_NOTES: 20 drum notes used in ENST dataset
|
30 |
+
GM_DRUM_NOTES: 45 GM drum notes with percussions
|
31 |
+
|
32 |
+
Program 128 is reserved for 'drum' internally.
|
33 |
+
Program 129 is reserved for 'unannotated', internally.
|
34 |
+
Program 100 is reserved for 'singing voice (melody)' in GM_INSTR_CLASS_PLUS.
|
35 |
+
Program 101 is reserved for 'singing voice (chorus)' in GM_INSTR_CLASS_PLUS.
|
36 |
+
|
37 |
+
|
38 |
+
"""
|
39 |
+
# yapf: disable
|
40 |
+
import numpy as np
|
41 |
+
|
42 |
+
PIANO_SOLO_CLASS = {
|
43 |
+
"Piano": np.arange(0, 8),
|
44 |
+
}
|
45 |
+
|
46 |
+
GUITAR_SOLO_CLASS = {
|
47 |
+
"Guitar": np.arange(24, 32),
|
48 |
+
}
|
49 |
+
|
50 |
+
SINGING_SOLO_CLASS = {
|
51 |
+
"Singing Voice": [100, 101],
|
52 |
+
}
|
53 |
+
|
54 |
+
SINGING_CHORUS_SEP_CLASS = {
|
55 |
+
"Singing Voice": [100],
|
56 |
+
"Singing Voice (chorus)": [101],
|
57 |
+
}
|
58 |
+
|
59 |
+
BASS_SOLO_CLASS = {
|
60 |
+
"Bass": np.arange(32, 40),
|
61 |
+
}
|
62 |
+
|
63 |
+
MUSICNET_INSTR_CLASS = {
|
64 |
+
"Piano": np.arange(0, 8),
|
65 |
+
"Strings": np.arange(40, 52), # Solo strings + ensemble strings
|
66 |
+
"Winds": np.arange(64, 80), # Reed + Pipe
|
67 |
+
}
|
68 |
+
|
69 |
+
GM_INSTR_CLASS = {
|
70 |
+
"Piano": np.arange(0, 8),
|
71 |
+
"Chromatic Percussion": np.arange(8, 16),
|
72 |
+
"Organ": np.arange(16, 24),
|
73 |
+
"Guitar": np.arange(24, 32),
|
74 |
+
"Bass": np.arange(32, 40),
|
75 |
+
"Strings": np.arange(40, 56), # Strings + Ensemble
|
76 |
+
# "Strings": np.arange(40, 48),
|
77 |
+
# "Ensemble": np.arange(48, 56),
|
78 |
+
"Brass": np.arange(56, 64),
|
79 |
+
"Reed": np.arange(64, 72),
|
80 |
+
"Pipe": np.arange(72, 80),
|
81 |
+
"Synth Lead": np.arange(80, 88),
|
82 |
+
"Synth Pad": np.arange(88, 96),
|
83 |
+
}
|
84 |
+
|
85 |
+
GM_INSTR_CLASS_PLUS = GM_INSTR_CLASS.copy()
|
86 |
+
GM_INSTR_CLASS_PLUS["Singing Voice"] = [100, 101]
|
87 |
+
|
88 |
+
GM_INSTR_EXT_CLASS = { # Best for enjoyable MIDI file generation
|
89 |
+
"Acoustic Piano": [0, 1, 3, 6, 7],
|
90 |
+
"Electric Piano": [2, 4, 5],
|
91 |
+
"Chromatic Percussion": np.arange(8, 16),
|
92 |
+
"Organ": np.arange(16, 24),
|
93 |
+
"Guitar (clean)": np.arange(24, 28),
|
94 |
+
"Guitar (distortion)": [30, 28, 29, 31], # np.arange(28, 32),
|
95 |
+
"Bass": [33, 32, 34, 35, 36, 37, 38, 39], # np.arange(32, 40),
|
96 |
+
"Strings": [48, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55], # np.arange(40, 56),
|
97 |
+
"Brass": np.arange(56, 64),
|
98 |
+
"Reed": np.arange(64, 72),
|
99 |
+
"Pipe": np.arange(72, 80),
|
100 |
+
"Synth Lead": np.arange(80, 88),
|
101 |
+
"Synth Pad": np.arange(88, 96),
|
102 |
+
}
|
103 |
+
GM_INSTR_EXT_CLASS_PLUS = GM_INSTR_EXT_CLASS.copy()
|
104 |
+
GM_INSTR_EXT_CLASS_PLUS["Singing Voice"] = [100]
|
105 |
+
GM_INSTR_EXT_CLASS_PLUS["Singing Voice (chorus)"] = [101]
|
106 |
+
|
107 |
+
GM_INSTR_FULL = {
|
108 |
+
"Acoustic Grand Piano": [0],
|
109 |
+
"Bright Acoustic Piano": [1],
|
110 |
+
"Electric Grand Piano": [2],
|
111 |
+
"Honky-tonk Piano": [3],
|
112 |
+
"Electric Piano 1": [4],
|
113 |
+
"Electric Piano 2": [5],
|
114 |
+
"Harpsichord": [6],
|
115 |
+
"Clavinet": [7],
|
116 |
+
"Celesta": [8],
|
117 |
+
"Glockenspiel": [9],
|
118 |
+
"Music Box": [10],
|
119 |
+
"Vibraphone": [11],
|
120 |
+
"Marimba": [12],
|
121 |
+
"Xylophone": [13],
|
122 |
+
"Tubular Bells": [14],
|
123 |
+
"Dulcimer": [15],
|
124 |
+
"Drawbar Organ": [16],
|
125 |
+
"Percussive Organ": [17],
|
126 |
+
"Rock Organ": [18],
|
127 |
+
"Church Organ": [19],
|
128 |
+
"Reed Organ": [20],
|
129 |
+
"Accordion": [21],
|
130 |
+
"Harmonica": [22],
|
131 |
+
"Tango Accordion": [23],
|
132 |
+
"Acoustic Guitar (nylon)": [24],
|
133 |
+
"Acoustic Guitar (steel)": [25],
|
134 |
+
"Electric Guitar (jazz)": [26],
|
135 |
+
"Electric Guitar (clean)": [27],
|
136 |
+
"Electric Guitar (muted)": [28],
|
137 |
+
"Overdriven Guitar": [29],
|
138 |
+
"Distortion Guitar": [30],
|
139 |
+
"Guitar Harmonics": [31],
|
140 |
+
"Acoustic Bass": [32],
|
141 |
+
"Electric Bass (finger)": [33],
|
142 |
+
"Electric Bass (pick)": [34],
|
143 |
+
"Fretless Bass": [35],
|
144 |
+
"Slap Bass 1": [36],
|
145 |
+
"Slap Bass 2": [37],
|
146 |
+
"Synth Bass 1": [38],
|
147 |
+
"Synth Bass 2": [39],
|
148 |
+
"Violin": [40],
|
149 |
+
"Viola": [41],
|
150 |
+
"Cello": [42],
|
151 |
+
"Contrabass": [43],
|
152 |
+
"Tremolo Strings": [44],
|
153 |
+
"Pizzicato Strings": [45],
|
154 |
+
"Orchestral Harp": [46],
|
155 |
+
"Timpani": [47],
|
156 |
+
"String Ensemble 1": [48],
|
157 |
+
"String Ensemble 2": [49],
|
158 |
+
"Synth Strings 1": [50],
|
159 |
+
"Synth Strings 2": [51],
|
160 |
+
"Choir Aahs": [52],
|
161 |
+
"Voice Oohs": [53],
|
162 |
+
"Synth Choir": [54],
|
163 |
+
"Orchestra Hit": [55],
|
164 |
+
"Trumpet": [56],
|
165 |
+
"Trombone": [57],
|
166 |
+
"Tuba": [58],
|
167 |
+
"Muted Trumpet": [59],
|
168 |
+
"French Horn": [60],
|
169 |
+
"Brass Section": [61],
|
170 |
+
"Synth Brass 1": [62],
|
171 |
+
"Synth Brass 2": [63],
|
172 |
+
"Soprano Sax": [64],
|
173 |
+
"Alto Sax": [65],
|
174 |
+
"Tenor Sax": [66],
|
175 |
+
"Baritone Sax": [67],
|
176 |
+
"Oboe": [68],
|
177 |
+
"English Horn": [69],
|
178 |
+
"Bassoon": [70],
|
179 |
+
"Clarinet": [71],
|
180 |
+
"Piccolo": [72],
|
181 |
+
"Flute": [73],
|
182 |
+
"Recorder": [74],
|
183 |
+
"Pan Flute": [75],
|
184 |
+
"Bottle Blow": [76],
|
185 |
+
"Shakuhachi": [77],
|
186 |
+
"Whistle": [78],
|
187 |
+
"Ocarina": [79],
|
188 |
+
"Lead 1 (square)": [80],
|
189 |
+
"Lead 2 (sawtooth)": [81],
|
190 |
+
"Lead 3 (calliope)": [82],
|
191 |
+
"Lead 4 (chiff)": [83],
|
192 |
+
"Lead 5 (charang)": [84],
|
193 |
+
"Lead 6 (voice)": [85],
|
194 |
+
"Lead 7 (fifths)": [86],
|
195 |
+
"Lead 8 (bass + lead)": [87],
|
196 |
+
"Pad 1 (new age)": [88],
|
197 |
+
"Pad 2 (warm)": [89],
|
198 |
+
"Pad 3 (polysynth)": [90],
|
199 |
+
"Pad 4 (choir)": [91],
|
200 |
+
"Pad 5 (bowed)": [92],
|
201 |
+
"Pad 6 (metallic)": [93],
|
202 |
+
"Pad 7 (halo)": [94],
|
203 |
+
"Pad 8 (sweep)": [95],
|
204 |
+
# "FX 1 (rain)": [96],
|
205 |
+
# "FX 2 (soundtrack)": [97],
|
206 |
+
# "FX 3 (crystal)": [98],
|
207 |
+
# "FX 4 (atmosphere)": [99],
|
208 |
+
# "FX 5 (brightness)": [100],
|
209 |
+
# "FX 6 (goblins)": [101],
|
210 |
+
# "FX 7 (echoes)": [102],
|
211 |
+
# "FX 8 (sci-fi)": [103],
|
212 |
+
# "Sitar": [104],
|
213 |
+
# "Banjo": [105],
|
214 |
+
# "Shamisen": [106],
|
215 |
+
# "Koto": [107],
|
216 |
+
# "Kalimba": [108],
|
217 |
+
# "Bagpipe": [109],
|
218 |
+
# "Fiddle": [110],
|
219 |
+
# "Shanai": [111],
|
220 |
+
# "Tinkle Bell": [112],
|
221 |
+
# "Agogo": [113],
|
222 |
+
# "Steel Drums": [114],
|
223 |
+
# "Woodblock": [115],
|
224 |
+
# "Taiko Drum": [116],
|
225 |
+
# "Melodic Tom": [117],
|
226 |
+
# "Synth Drum": [118],
|
227 |
+
# "Reverse Cymbal": [119],
|
228 |
+
# "Guitar Fret Noise": [120],
|
229 |
+
# "Breath Noise": [121],
|
230 |
+
# "Seashore": [122],
|
231 |
+
# "Bird Tweet": [123],
|
232 |
+
# "Telephone Ring": [124],
|
233 |
+
# "Helicopter": [125],
|
234 |
+
# "Applause": [126],
|
235 |
+
# "Gunshot": [127]
|
236 |
+
}
|
237 |
+
|
238 |
+
MT3_FULL = { # this matches the class names in Table 3 of MT3 paper
|
239 |
+
"Acoustic Piano": [0, 1, 3, 6, 7],
|
240 |
+
"Electric Piano": [2, 4, 5],
|
241 |
+
"Chromatic Percussion": np.arange(8, 16),
|
242 |
+
"Organ": np.arange(16, 24),
|
243 |
+
"Acoustic Guitar": np.arange(24, 26),
|
244 |
+
"Clean Electric Guitar": np.arange(26, 29),
|
245 |
+
"Distorted Electric Guitar": np.arange(29, 32),
|
246 |
+
"Acoustic Bass": [32, 35],
|
247 |
+
"Electric Bass": [33, 34, 36, 37, 38, 39],
|
248 |
+
"Violin": [40],
|
249 |
+
"Viola": [41],
|
250 |
+
"Cello": [42],
|
251 |
+
"Contrabass": [43],
|
252 |
+
"Orchestral Harp": [46],
|
253 |
+
"Timpani": [47],
|
254 |
+
"String Ensemble": [48, 49, 44, 45],
|
255 |
+
"Synth Strings": [50, 51],
|
256 |
+
"Choir and Voice": [52, 53, 54],
|
257 |
+
"Orchestra Hit": [55],
|
258 |
+
"Trumpet": [56, 59],
|
259 |
+
"Trombone": [57],
|
260 |
+
"Tuba": [58],
|
261 |
+
"French Horn": [60],
|
262 |
+
"Brass Section": [61, 62, 63],
|
263 |
+
"Soprano/Alto Sax": [64, 65],
|
264 |
+
"Tenor Sax": [66],
|
265 |
+
"Baritone Sax": [67],
|
266 |
+
"Oboe": [68],
|
267 |
+
"English Horn": [69],
|
268 |
+
"Bassoon": [70],
|
269 |
+
"Clarinet": [71],
|
270 |
+
"Pipe": [73, 72, 74, 75, 76, 77, 78, 79],
|
271 |
+
"Synth Lead": np.arange(80, 88),
|
272 |
+
"Synth Pad": np.arange(88, 96),
|
273 |
+
}
|
274 |
+
|
275 |
+
MT3_FULL_PLUS = MT3_FULL.copy()
|
276 |
+
MT3_FULL_PLUS["Singing Voice"] = [100]
|
277 |
+
MT3_FULL_PLUS["Singing Voice (chorus)"] = [101]
|
278 |
+
|
279 |
+
ENST_DRUM_NOTES = {
|
280 |
+
"bd": [36], # Kick Drum
|
281 |
+
"sd": [38], # Snare Drum
|
282 |
+
"sweep": [0], # Brush sweep
|
283 |
+
"sticks": [1], # Sticks
|
284 |
+
"rs": [2], # Rim shot
|
285 |
+
"cs": [37], # X-stick
|
286 |
+
"chh": [42], # Closed Hi-Hat
|
287 |
+
"ohh": [46], # Open Hi-Hat
|
288 |
+
"cb": [56], # Cowbell
|
289 |
+
"c": [3], # Other Cymbals
|
290 |
+
"lmt": [47], # Low Mid Tom
|
291 |
+
"mt": [48], # Mid Tom
|
292 |
+
"mtr": [58], # Mid Tom Rim
|
293 |
+
"lt": [45], # Low Tom
|
294 |
+
"ltr": [50], # Low Tom Rim
|
295 |
+
"lft": [41], # Low Floor Tom
|
296 |
+
"rc": [51], # Ride Cymbal
|
297 |
+
"ch": [52], # Chinese Cymbal
|
298 |
+
"cr": [49], # Crash Cymbal
|
299 |
+
"spl": [55], # Splash Cymbal
|
300 |
+
}
|
301 |
+
|
302 |
+
EGMD_DRUM_NOTES = {
|
303 |
+
"Kick Drum": [36], # Listed by order of most common annotation
|
304 |
+
"Snare X-stick": [37], # Snare X-Stick, https://youtu.be/a2KFrrKaoYU?t=80
|
305 |
+
"Snare Drum": [38], # Snare (head) and Electric Snare
|
306 |
+
"Closed Hi-Hat": [42, 44, 22], # 44 is pedal hi-hat
|
307 |
+
"Open Hi-Hat": [46, 26],
|
308 |
+
"Cowbell": [56],
|
309 |
+
"High Floor Tom": [43],
|
310 |
+
"Low Floor Tom": [41], # Lowest Tom
|
311 |
+
"Low Tom": [45],
|
312 |
+
"Low-Mid Tom": [47],
|
313 |
+
"Mid Tom": [48],
|
314 |
+
"Low Tom (Rim)": [50], # TD-17: 47, 50, 58
|
315 |
+
"Mid Tom (Rim)": [58],
|
316 |
+
# "Ride Cymbal": [51, 53, 59],
|
317 |
+
"Ride": [51],
|
318 |
+
"Ride (Bell)": [53], # https://youtu.be/b94hZoM5s3k?t=323
|
319 |
+
"Ride (Edge)": [59],
|
320 |
+
"Chinese Cymbal": [52],
|
321 |
+
"Crash Cymbal": [49, 57],
|
322 |
+
"Splash Cymbal": [55],
|
323 |
+
}
|
324 |
+
|
325 |
+
# Inspired by Roland TD-17 MIDI note map, https://rolandus.zendesk.com/hc/en-us/articles/360005173411-TD-17-Default-Factory-MIDI-Note-Map
|
326 |
+
GM_DRUM_NOTES = {
|
327 |
+
"Kick Drum": [36, 35], # Listed by order of most common annotation
|
328 |
+
"Snare X-stick": [37, 2], # Snare X-Stick, https://youtu.be/a2KFrrKaoYU?t=80
|
329 |
+
"Snare Drum": [38, 40], # Snare (head) and Electric Snare
|
330 |
+
"Closed Hi-Hat": [42, 44, 22], # 44 is pedal hi-hat
|
331 |
+
"Open Hi-Hat": [46, 26],
|
332 |
+
"Cowbell": [56],
|
333 |
+
"High Floor Tom": [43],
|
334 |
+
"Low Floor Tom": [41], # Lowest Tom
|
335 |
+
"Low Tom": [45],
|
336 |
+
"Low-Mid Tom": [47],
|
337 |
+
"Mid Tom": [48],
|
338 |
+
"Low Tom (Rim)": [50], # TD-17: 47, 50, 58
|
339 |
+
"Mid Tom (Rim)": [58],
|
340 |
+
# "Ride Cymbal": [51, 53, 59],
|
341 |
+
"Ride": [51],
|
342 |
+
"Ride (Bell)": [53], # https://youtu.be/b94hZoM5s3k?t=323
|
343 |
+
"Ride (Edge)": [59],
|
344 |
+
"Chinese Cymbal": [52],
|
345 |
+
"Crash Cymbal": [49, 57],
|
346 |
+
"Splash Cymbal": [55],
|
347 |
+
}
|
348 |
+
|
349 |
+
KICK_SNARE_HIHAT = {
|
350 |
+
"Kick Drum": [36, 35],
|
351 |
+
"Snare Drum": [38, 40],
|
352 |
+
# "Snare Drum + X-Stick": [38, 40, 37, 2],
|
353 |
+
# "Snare X-stick": [37, 2], # Snare X-Stick, https://youtu.be/a2KFrrKaoYU?t=80
|
354 |
+
"Hi-Hat": [42, 44, 46, 22, 26],
|
355 |
+
# "Ride Cymbal": [51, 53, 59],
|
356 |
+
# "Hi-Hat + Ride": [42, 44, 46, 22, 26, 51, 53, 59],
|
357 |
+
# "HiHat + all Cymbals": [42, 44, 46, 22, 26, 51, 53, 59, 52, 49, 57, 55],
|
358 |
+
# "Kick Drum + Low Tom": [36, 35, 45],
|
359 |
+
# "All Cymbal": [51, 53, 59, 52, 49, 57, 55]
|
360 |
+
# "all": np.arange(30, 60)
|
361 |
+
}
|
362 |
+
|
363 |
+
drum_vocab_presets = {
|
364 |
+
"gm": GM_DRUM_NOTES,
|
365 |
+
"egmd": EGMD_DRUM_NOTES,
|
366 |
+
"enst": ENST_DRUM_NOTES,
|
367 |
+
"ksh": KICK_SNARE_HIHAT,
|
368 |
+
"kshr": {
|
369 |
+
"Kick Drum": [36, 35],
|
370 |
+
"Snare Drum": [38, 40],
|
371 |
+
"Hi-Hat": [42, 44, 46, 22, 26, 51, 53, 59],
|
372 |
+
}
|
373 |
+
}
|
374 |
+
|
375 |
+
program_vocab_presets = {
|
376 |
+
"gm_full": GM_INSTR_FULL, # 96 classes (except drums)
|
377 |
+
"mt3_full": MT3_FULL, # 34 classes (except drums) as in MT3 paper
|
378 |
+
"mt3_midi": GM_INSTR_CLASS, # 11 classes (except drums) as in MT3 paper
|
379 |
+
"mt3_midi_plus": GM_INSTR_CLASS_PLUS, # 11 classes + singing (except drums)
|
380 |
+
"mt3_full_plus": MT3_FULL_PLUS, # 34 classes (except drums) mt3_full + singing (except drums)
|
381 |
+
"gm": GM_INSTR_CLASS, # 11 classes (except drums)
|
382 |
+
"gm_plus": GM_INSTR_CLASS_PLUS, # 11 classes + singing (except drums)
|
383 |
+
"gm_ext_plus": GM_INSTR_EXT_CLASS_PLUS, # 13 classes + singing + chorus (except drums)
|
384 |
+
}
|
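As the vocabulary docstring above explains, an eval vocabulary accepts any program number inside an instrument class as correct. A small hypothetical helper (not in the diff) that inverts such a class dict into a program-to-class-name lookup:

import numpy as np
from config.vocabulary import GM_INSTR_CLASS_PLUS

def program_to_class_map(vocab: dict) -> dict:
    # Map every MIDI program number to the name of its instrument class.
    mapping = {}
    for class_name, programs in vocab.items():
        for p in np.atleast_1d(programs):
            mapping[int(p)] = class_name
    return mapping

m = program_to_class_map(GM_INSTR_CLASS_PLUS)
print(m[0], m[33], m[100])  # Piano, Bass, Singing Voice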
amt/src/extras/.DS_Store
ADDED
Binary file (10.2 kB)
amt/src/extras/Dockerfile
ADDED
@@ -0,0 +1,18 @@
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
+LABEL maintainer="https://github.com/mimbres/YourMT3"
+
+ENV TZ=Europe/London \
+    DEBIAN_FRONTEND=noninteractive
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+RUN apt-get update
+ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN apt-get update --fix-missing && apt-get install -y wget curl \
+    nano git ffmpeg sox tmux htop
+RUN pip3 install --upgrade pip
+RUN pip3 install mirdata mido git+https://github.com/craffel/mir_eval.git \
+    matplotlib "lightning>=2.0.2" pytest-timeout pytest deprecated librosa \
+    einops transformers wandb
+
+CMD [ "/bin/bash" ]
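Typical usage of this Dockerfile (an illustrative guess, not stated in the diff) would be: docker build -t yourmt3-dev -f amt/src/extras/Dockerfile . followed by docker run --gpus all -it --rm yourmt3-dev; the image tag is arbitrary.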
amt/src/extras/check_drum_channel_slakh.py
ADDED
@@ -0,0 +1,24 @@
+from mido import MidiFile
+from utils.mirdata_dev.datasets import slakh16k
+
+
+def check_drum_channel_slakh(data_home: str):
+    ds = slakh16k.Dataset(data_home, version='default')
+    for track_id in ds.track_ids:
+        is_drum = ds.track(track_id).is_drum
+        midi = MidiFile(ds.track(track_id).midi_path)
+        cnt = 0
+        for msg in midi:
+            if 'note' in msg.type:
+                if is_drum and (msg.channel != 9):
+                    print('found drum track with channel != 9 in track_id: ', track_id)
+                if not is_drum and (msg.channel == 9):
+                    print('found non-drum track with channel == 9 in track_id: ', track_id)
+                if is_drum and (msg.channel == 9):
+                    cnt += 1
+        if cnt > 0:
+            print(f'found {cnt} notes in drum track with ch 9 in track_id: ', track_id)
+    return
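Illustrative call (the data path is a placeholder, not taken from the diff):

# Scan every Slakh track and report drum-flag / channel-9 mismatches.
check_drum_channel_slakh(data_home='../../data')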
amt/src/extras/dataset_mutable_var_sanity_check.py
ADDED
@@ -0,0 +1,81 @@
1 |
+
for n in range(1000):
|
2 |
+
sampled_data = ds.__getitem__(n)
|
3 |
+
|
4 |
+
a = deepcopy(sampled_data['note_event_segments'])
|
5 |
+
b = deepcopy(sampled_data['note_event_segments'])
|
6 |
+
|
7 |
+
for (note_events, tie_note_events, start_time) in list(zip(*b.values())):
|
8 |
+
note_events = pitch_shift_note_events(note_events, 2)
|
9 |
+
tie_note_events = pitch_shift_note_events(tie_note_events, 2)
|
10 |
+
|
11 |
+
# compare
|
12 |
+
for i, (note_events, tie_note_events, start_time) in enumerate(list(zip(*b.values()))):
|
13 |
+
for j, ne in enumerate(note_events):
|
14 |
+
if ne.is_drum is False:
|
15 |
+
if ne.pitch != a['note_events'][i][j].pitch + 2:
|
16 |
+
print(i, j)
|
17 |
+
assert ne.pitch == a['note_events'][i][j].pitch + 2
|
18 |
+
|
19 |
+
for k, tne in enumerate(tie_note_events):
|
20 |
+
assert tne.pitch == a['tie_note_events'][i][k].pitch + 2
|
21 |
+
|
22 |
+
print('test {} passed'.format(n))
|
23 |
+
|
24 |
+
|
25 |
+
def assert_note_events_almost_equal(actual_note_events,
|
26 |
+
predicted_note_events,
|
27 |
+
ignore_time=False,
|
28 |
+
ignore_activity=True,
|
29 |
+
delta=5.1e-3):
|
30 |
+
"""
|
31 |
+
Asserts that the given lists of Note instances are equal up to a small
|
32 |
+
floating-point tolerance, similar to `assertAlmostEqual` of `unittest`.
|
33 |
+
Tolerance is 5e-3 by default, which is 5 ms for 100 ticks-per-second.
|
34 |
+
|
35 |
+
If `ignore_time` is True, then the time field is ignored. (useful for
|
36 |
+
comparing tie note events, default is False)
|
37 |
+
|
38 |
+
If `ignore_activity` is True, then the activity field is ignored (default
|
39 |
+
is True).
|
40 |
+
"""
|
41 |
+
assert len(actual_note_events) == len(predicted_note_events)
|
42 |
+
for j, (actual_note_event,
|
43 |
+
predicted_note_event) in enumerate(zip(actual_note_events, predicted_note_events)):
|
44 |
+
if ignore_time is False:
|
45 |
+
assert abs(actual_note_event.time - predicted_note_event.time) <= delta
|
46 |
+
assert actual_note_event.is_drum == predicted_note_event.is_drum
|
47 |
+
if actual_note_event.is_drum is False and predicted_note_event.is_drum is False:
|
48 |
+
assert actual_note_event.program == predicted_note_event.program
|
49 |
+
assert actual_note_event.pitch == predicted_note_event.pitch
|
50 |
+
assert actual_note_event.velocity == predicted_note_event.velocity
|
51 |
+
if ignore_activity is False:
|
52 |
+
assert actual_note_event.activity == predicted_note_event.activity
|
53 |
+
|
54 |
+
|
55 |
+
cache_old = deepcopy(dict(ds.cache))
|
56 |
+
for n in range(500):
|
57 |
+
sampled_data = ds.__getitem__(n)
|
58 |
+
cache_new = ds.cache
|
59 |
+
cnt = 0
|
60 |
+
for k, v in cache_new.items():
|
61 |
+
if k in cache_old:
|
62 |
+
cnt += 1
|
63 |
+
assert (cache_new[k]['programs'] == cache_old[k]['programs']).all()
|
64 |
+
assert (cache_new[k]['is_drum'] == cache_old[k]['is_drum']).all()
|
65 |
+
assert (cache_new[k]['has_stems'] == cache_old[k]['has_stems'])
|
66 |
+
assert (cache_new[k]['has_unannotated'] == cache_old[k]['has_unannotated'])
|
67 |
+
assert (cache_new[k]['audio_array'] == cache_old[k]['audio_array']).all()
|
68 |
+
|
69 |
+
for nes_new, nes_old in zip(cache_new[k]['note_event_segments']['note_events'],
|
70 |
+
cache_old[k]['note_event_segments']['note_events']):
|
71 |
+
assert_note_events_almost_equal(nes_new, nes_old)
|
72 |
+
|
73 |
+
for tnes_new, tnes_old in zip(cache_new[k]['note_event_segments']['tie_note_events'],
|
74 |
+
cache_old[k]['note_event_segments']['tie_note_events']):
|
75 |
+
assert_note_events_almost_equal(tnes_new, tnes_old, ignore_time=True)
|
76 |
+
|
77 |
+
for s_new, s_old in zip(cache_new[k]['note_event_segments']['start_times'],
|
78 |
+
cache_old[k]['note_event_segments']['start_times']):
|
79 |
+
assert s_new == s_old
|
80 |
+
cache_old = deepcopy(dict(ds.cache))
|
81 |
+
print(n, cnt)
|
amt/src/extras/datasets_eval_testing.py
ADDED
@@ -0,0 +1,42 @@
+from utils.datasets_eval import AudioFileDataset
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+
+
+def test():
+
+    ds = AudioFileDataset()
+    dl = DataLoader(
+        ds, batch_size=None, collate_fn=lambda k: k
+    )  # empty collate_fn is required to use mixed types.
+
+    for x, y in dl:
+        break
+
+    class MyModel(pl.LightningModule):
+
+        def __init__(self, **kwargs):
+            super().__init__()
+
+        def forward(self, x):
+            return x
+
+        def training_step(self, batch, batch_idx):
+            return 0
+
+        def validation_step(self, batch, batch_idx):
+            print(batch)
+            return 0
+
+        def train_dataloader(self):
+            return dl
+
+        def val_dataloader(self):
+            return dl
+
+        def configure_optimizers(self):
+            return None
+
+    model = MyModel()
+    trainer = pl.Trainer()
+    trainer.validate(model)
amt/src/extras/demo_cross_augmentation.py
ADDED
@@ -0,0 +1,69 @@
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
from typing import Dict, Tuple
|
11 |
+
from copy import deepcopy
|
12 |
+
import soundfile as sf
|
13 |
+
import torch
|
14 |
+
from utils.data_modules import AMTDataModule
|
15 |
+
from config.data_presets import data_preset_single_cfg, data_preset_multi_cfg
|
16 |
+
from utils.augment import intra_stem_augment_processor
|
17 |
+
|
18 |
+
|
19 |
+
def get_ds(data_preset_multi: Dict, train_num_samples_per_epoch: int = 90000):
|
20 |
+
dm = AMTDataModule(data_preset_multi=data_preset_multi, train_num_samples_per_epoch=train_num_samples_per_epoch)
|
21 |
+
dm.setup('fit')
|
22 |
+
dl = dm.train_dataloader()
|
23 |
+
ds = dl.flattened[0].dataset
|
24 |
+
return ds
|
25 |
+
|
26 |
+
|
27 |
+
def debug_func(num_segments: int = 10):
|
28 |
+
sampled_data, sampled_ids = ds._get_rand_segments_from_cache(num_segments)
|
29 |
+
ux_sampled_data, _ = ds._get_rand_segments_from_cache(ux_count_sum, False, sampled_ids)
|
30 |
+
s = deepcopy(sampled_data)
|
31 |
+
intra_stem_augment_processor(sampled_data, submix_audio=False)
|
32 |
+
|
33 |
+
|
34 |
+
def gen_audio(index: int = 0):
|
35 |
+
# audio_arr: (b, 1, nframe), note_token_arr: (b, l), task_token_arr: (b, task_l)
|
36 |
+
audio_arr, note_token_arr, task_token_arr = ds.__getitem__(index)
|
37 |
+
|
38 |
+
# merge all the segments into one audio file
|
39 |
+
audio = audio_arr.permute(0, 2, 1).reshape(-1).squeeze().numpy()
|
40 |
+
|
41 |
+
# save the audio file
|
42 |
+
sf.write('xaug_demo_audio.wav', audio, 16000, subtype='PCM_16')
|
43 |
+
|
44 |
+
|
45 |
+
data_preset_multi = data_preset_multi_cfg["all_cross_rebal5"]
|
46 |
+
ds = get_ds(data_preset_multi)
|
47 |
+
ds.random_amp_range = [0.8, 1.1]
|
48 |
+
ds.stem_xaug_policy = {
|
49 |
+
"max_k": 5,
|
50 |
+
"tau": 0.3,
|
51 |
+
"alpha": 1.0,
|
52 |
+
"max_subunit_stems": 12,
|
53 |
+
"no_instr_overlap": True,
|
54 |
+
"no_drum_overlap": True,
|
55 |
+
"uhat_intra_stem_augment": True,
|
56 |
+
}
|
57 |
+
gen_audio(3)
|
58 |
+
|
59 |
+
# for k in ds.cache.keys():
|
60 |
+
# arr = ds.cache[k]['audio_array']
|
61 |
+
# arr = np.sum(arr, axis=1).reshape(-1)
|
62 |
+
# # sf.write(f'xxx/{k}.wav', arr, 16000, subtype='PCM_16')
|
63 |
+
# if np.min(arr) > -0.5:
|
64 |
+
# print(k)
|
65 |
+
|
66 |
+
# arr = ds.cache[52]['audio_array']
|
67 |
+
# for i in range(arr.shape[1]):
|
68 |
+
# a = arr[:, i, :].reshape(-1)
|
69 |
+
# sf.write(f'xxx52/52_{i}.wav', a, 16000, subtype='PCM_16')
|
amt/src/extras/demo_intra_augmentation.py
ADDED
@@ -0,0 +1,52 @@
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
import numpy as np
|
11 |
+
import torch
|
12 |
+
import json
|
13 |
+
import soundfile as sf
|
14 |
+
from utils.datasets_train import get_cache_data_loader
|
15 |
+
|
16 |
+
|
17 |
+
def get_filelist(track_id: int) -> dict:
|
18 |
+
filelist = '../../data/yourmt3_indexes/slakh_train_file_list.json'
|
19 |
+
with open(filelist, 'r') as f:
|
20 |
+
fl = json.load(f)
|
21 |
+
new_filelist = dict()
|
22 |
+
for key, value in fl.items():
|
23 |
+
if int(key) == track_id:
|
24 |
+
new_filelist[0] = value
|
25 |
+
return new_filelist
|
26 |
+
|
27 |
+
|
28 |
+
def get_ds(track_id: int, random_amp_range: list = [1., 1.], stem_aug_prob: float = 0.8):
|
29 |
+
filelist = get_filelist(track_id)
|
30 |
+
dl = get_cache_data_loader(filelist,
|
31 |
+
'train',
|
32 |
+
1,
|
33 |
+
1,
|
34 |
+
random_amp_range=random_amp_range,
|
35 |
+
stem_aug_prob=stem_aug_prob,
|
36 |
+
shuffle=False)
|
37 |
+
ds = dl.dataset
|
38 |
+
return ds
|
39 |
+
|
40 |
+
|
41 |
+
def gen_audio(track_id: int, n_segments: int = 30, random_amp_range: list = [1., 1.], stem_aug_prob: float = 0.8):
|
42 |
+
ds = get_ds(track_id, random_amp_range, stem_aug_prob)
|
43 |
+
audio = []
|
44 |
+
for i in range(n_segments):
|
45 |
+
audio.append(ds.__getitem__(0)[0])
|
46 |
+
# audio.append(ds.__getitem__(i)[0])
|
47 |
+
|
48 |
+
audio = torch.concat(audio, dim=2).numpy()[0, 0, :]
|
49 |
+
sf.write('audio.wav', audio, 16000, subtype='PCM_16')
|
50 |
+
|
51 |
+
|
52 |
+
gen_audio(1, 20)
|
amt/src/extras/download_mirst500.py
ADDED
@@ -0,0 +1,50 @@
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import numpy as np
|
4 |
+
from pytube import YouTube
|
5 |
+
|
6 |
+
|
7 |
+
def downloadMp3(yt, idx, askPath=0):
|
8 |
+
# extract only audio
|
9 |
+
video = yt.streams.filter(only_audio=True).first()
|
10 |
+
|
11 |
+
destination = 'mp3File'
|
12 |
+
# check for destination to save file
|
13 |
+
if (askPath == 1):
|
14 |
+
print("Enter the destination (leave blank for default dir mp3File)")
|
15 |
+
destination = str(input(">> ")) or 'mp3File'
|
16 |
+
|
17 |
+
# download the file
|
18 |
+
out_file = video.download(output_path=destination)
|
19 |
+
|
20 |
+
# save the file
|
21 |
+
# base, ext = os.path.splitext(out_file)
|
22 |
+
dir_path, file_base = os.path.split(out_file)
|
23 |
+
|
24 |
+
new_file = os.path.join(dir_path, f'{idx}.mp3')
|
25 |
+
os.rename(out_file, new_file)
|
26 |
+
# result of success
|
27 |
+
print(yt.title + " has been successfully downloaded.")
|
28 |
+
|
29 |
+
|
30 |
+
MISSING_FILE_IDS = [
|
31 |
+
16, 26, 33, 38, 40, 50, 53, 55, 60, 81, 82, 98, 107, 122, 126, 127, 129, 141, 145, 150, 172,
|
32 |
+
201, 205, 206, 215, 216, 221, 226, 232, 240, 243, 245, 255, 257, 267, 273, 278, 279, 285, 287,
|
33 |
+
291, 304, 312, 319, 321, 325, 329, 332, 333, 336, 337, 342, 359, 375, 402, 417, 438, 445, 454,
|
34 |
+
498
|
35 |
+
]
|
36 |
+
|
37 |
+
data_link_file = '../../../data/mir_St500_yourmt3_16k/MIR-ST500_20210206/MIR-ST500_link.json'
|
38 |
+
data_link = json.load(open(data_link_file, 'r'))
|
39 |
+
download_fail = []
|
40 |
+
|
41 |
+
for i in MISSING_FILE_IDS:
|
42 |
+
print(f'Downloading {i}...')
|
43 |
+
yt = YouTube(data_link[str(i)])
|
44 |
+
try:
|
45 |
+
downloadMp3(yt, idx=i)
|
46 |
+
except:
|
47 |
+
download_fail.append(i)
|
48 |
+
print(f'Failed to download {i}.')
|
49 |
+
|
50 |
+
print(f'Failed to download {len(download_fail)} files: {download_fail}')
|
amt/src/extras/fig/label_smooth_interval_of_interest.png
ADDED
amt/src/extras/fig/pitchshift_benchnmark.png
ADDED
amt/src/extras/fig/pitchshift_stretch_and_resampler_process_time.png
ADDED
amt/src/extras/inspecting_slakh_bass.py
ADDED
@@ -0,0 +1,34 @@
+import mirdata
+from utils.mirdata_dev.datasets import slakh16k
+
+ds = slakh16k.Dataset(data_home='../../data', version='2100-yourmt3-16k')
+mtrack_ids = ds.mtrack_ids
+
+# Collect plugin names
+plugin_names = set()
+cnt = 0
+for mtrack_id in mtrack_ids:
+    mtrack = ds.multitrack(mtrack_id)
+    for track_id in mtrack.track_ids:
+        track = ds.track(track_id)
+        if track.instrument.lower() == 'bass':
+            if track.plugin_name == 'upright_bass.nkm':
+                print(f'{str(cnt)}: {track_id}: {track.plugin_name}')
+            # if track.plugin_name not in plugin_names:
+            #     plugin_names.add(track.plugin_name)
+            #     print(f'{str(cnt)}: {track_id}: {track.plugin_name}')
+            #     cnt += 1
+"""
+0: Track00001-S03: scarbee_rickenbacker_bass_palm_muted.nkm
+1: Track00002-S01: classic_bass.nkm
+2: Track00004-S01: scarbee_rickenbacker_bass.nkm
+3: Track00005-S04: scarbee_jay_bass_both.nkm
+4: Track00006-S03: pop_bass.nkm
+5: Track00008-S00: scarbee_pre_bass.nkm
+6: Track00013-S00: jazz_upright.nkm
+7: Track00014-S01: funk_bass.nkm
+8: Track00016-S01: scarbee_mm_bass.nkm
+9: Track00024-S07: upright_bass.nkm
+10: Track00027-S03: scarbee_jay_bass_slap_both.nkm
+11: Track00094-S08: upright_bass2.nkm
+"""
amt/src/extras/install_deepspeed.md
ADDED
@@ -0,0 +1,28 @@
+"""
+
+# not required on pytorch 2.0:latest container
+pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64
+
+apt-get update
+apt-get install git
+apt-get install libaio-dev
+
+DS_BUILD_OPS=1 pip install deepspeed
+ds_report
+
+
+pip install deepspeed==0.7.7
+
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
+In case you have trouble building apex from source, we recommend using the NGC containers,
+which come with a pre-built PyTorch and apex release:
+
+nvcr.io/nvidia/pytorch:23.01-py3
+
+pip install deepspeed; pip install transformers[deepspeed]
+https://www.deepspeed.ai/docs/config-json/#autotuning
+
+"""
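A quick sanity check after installation (my suggestion, not part of the note above): run ds_report to see the op compatibility table, and python -c "import deepspeed; print(deepspeed.__version__)" to confirm the package imports.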
amt/src/extras/label_smoothing.py
ADDED
@@ -0,0 +1,67 @@
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+
+a = torch.signal.windows.gaussian(11, sym=True, std=3)
+plt.plot(a)
+
+
+def gaussian_smoothing(y_hot, mu=5, sigma=0.865):
+    """
+    y_hot: one-hot encoded array
+    """
+    # sigma = np.sqrt(np.abs(np.log(0.05) / ((4 - mu)**2))) / 2
+
+    # Generate index array
+    i = np.arange(len(y_hot))
+
+    # Gaussian function
+    y_smooth = np.exp(-(i - mu)**2 / (2 * sigma**2))
+
+    # Normalize the resulting array
+    y_smooth /= y_smooth.sum()
+    return y_smooth, sigma
+
+
+# y_ls = (1 - α) * y_hot + α / K, where K is the number of classes, alpha is the smoothing parameter
+
+y_hot = torch.zeros(11)
+y_hot[5] = 1
+plt.plot(y_hot, 'b.-')
+
+alpha = 0.3
+y_ls = (1 - alpha) * y_hot + alpha / 10
+plt.plot(y_ls, 'r.-')
+
+y_gs, std = gaussian_smoothing(y_hot)
+plt.plot(y_gs, 'g.-')
+
+y_gst_a, std = gaussian_smoothing(y_hot, mu=5.5)
+plt.plot(y_gst_a, 'y.-')
+
+y_gst_b, std = gaussian_smoothing(y_hot, mu=5.8)
+plt.plot(y_gst_b, 'c.-')
+
+plt.legend([
+    'y_hot', 'label smoothing' + '\n' + '(alpha=0.3)',
+    'gaussian smoothing' + '\n' + 'for interval of interest' + '\n' + 'mu=5',
+    'gaussian smoothing' + '\n' + 'mu=5.5', 'gaussian smoothing' + '\n' + 'mu=5.8'
+])
+
+plt.grid()
+plt.xticks(np.arange(11), np.arange(0, 110, 10))
+plt.xlabel('''Time (ms)
+original (quantized) one-hot label:
+[0,0,0,0,0,1,0,0,0,0,0]
+\n
+label smoothing is defined as:
+y_ls = (1 - α) * y_hot + α / K,
+where K is the number of classes, α is the smoothing parameter
+\n
+gaussian smoothing for the interval (± 10ms) of interest:
+y_gs = exp(-(i - mu)**2 / (2 * sigma**2))
+with sigma = 0.865 and mu = 5
+\n
+gaussian smoothing with unquantized target timing:
+mu = 5.5 for 55ms target timing
+''')
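A worked instance of the label-smoothing formula used above: with the 11-bin one-hot target, alpha = 0.3 and the K = 10 used in the code, the hot position becomes (1 - 0.3) * 1 + 0.3 / 10 = 0.73 and every other position becomes 0.3 / 10 = 0.03, which is exactly the red y_ls curve plotted against the blue y_hot curve.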
amt/src/extras/multi_channel_seqlen_stats.py
ADDED
@@ -0,0 +1,177 @@
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
from typing import Dict, Tuple
|
11 |
+
from copy import deepcopy
|
12 |
+
from collections import Counter
|
13 |
+
import numpy as np
|
14 |
+
import torch
|
15 |
+
from utils.data_modules import AMTDataModule
|
16 |
+
from utils.task_manager import TaskManager
|
17 |
+
from config.data_presets import data_preset_single_cfg, data_preset_multi_cfg
|
18 |
+
from utils.augment import intra_stem_augment_processor
|
19 |
+
|
20 |
+
|
21 |
+
def get_ds(data_preset_multi: Dict, task_name: str, train_num_samples_per_epoch: int = 90000):
|
22 |
+
tm = TaskManager(task_name=task_name)
|
23 |
+
tm.max_note_token_length_per_ch = 1024 # only to check the max length
|
24 |
+
dm = AMTDataModule(data_preset_multi=data_preset_multi,
|
25 |
+
task_manager=tm,
|
26 |
+
train_num_samples_per_epoch=train_num_samples_per_epoch)
|
27 |
+
dm.setup('fit')
|
28 |
+
dl = dm.train_dataloader()
|
29 |
+
ds = dl.flattened[0].dataset
|
30 |
+
return ds
|
31 |
+
|
32 |
+
|
33 |
+
data_preset_multi = data_preset_multi_cfg["all_cross_v6"]
|
34 |
+
task_name = "mc13" # "mt3_full_plus"
|
35 |
+
ds = get_ds(data_preset_multi, task_name=task_name)
|
36 |
+
ds.random_amp_range = [0.8, 1.1]
|
37 |
+
ds.stem_xaug_policy = {
|
38 |
+
"max_k": 5,
|
39 |
+
"tau": 0.3,
|
40 |
+
"alpha": 1.0,
|
41 |
+
"max_subunit_stems": 12,
|
42 |
+
"no_instr_overlap": True,
|
43 |
+
"no_drum_overlap": True,
|
44 |
+
"uhat_intra_stem_augment": True,
|
45 |
+
}
|
46 |
+
|
47 |
+
length_all = []
|
48 |
+
for i in range(40000):
|
49 |
+
if i % 5000 == 0:
|
50 |
+
print(i)
|
51 |
+
audio_arr, note_token_arr, task_totken_arr, pshift_steps = ds.__getitem__(i)
|
52 |
+
lengths = torch.sum(note_token_arr != 0, dim=2).flatten().cpu().tolist()
|
53 |
+
length_all.extend(lengths)
|
54 |
+
|
55 |
+
length_all = np.asarray(length_all)
|
56 |
+
|
57 |
+
# stats
|
58 |
+
empty_sequence = np.sum(length_all < 3) / len(length_all) * 100
|
59 |
+
print("empty_sequences:", f"{empty_sequence:.2f}", "%")
|
60 |
+
|
61 |
+
mean_except_empty = np.mean(length_all[length_all > 2])
|
62 |
+
print("mean_except_empty:", mean_except_empty)
|
63 |
+
|
64 |
+
median_except_empty = np.median(length_all[length_all > 2])
|
65 |
+
print("median_except_empty:", median_except_empty)
|
66 |
+
|
67 |
+
ch_less_than_768 = np.sum(length_all < 768) / len(length_all) * 100
|
68 |
+
print("ch_less_than_768:", f"{ch_less_than_768:.2f}", "%")
|
69 |
+
|
70 |
+
ch_larger_than_512 = np.sum(length_all > 512) / len(length_all) * 100
|
71 |
+
print("ch_larger_than_512:", f"{ch_larger_than_512:.6f}", "%")
|
72 |
+
|
73 |
+
ch_larger_than_256 = np.sum(length_all > 256) / len(length_all) * 100
|
74 |
+
print("ch_larger_than_256:", f"{ch_larger_than_256:.6f}", "%")
|
75 |
+
|
76 |
+
ch_larger_than_128 = np.sum(length_all > 128) / len(length_all) * 100
|
77 |
+
print("ch_larger_than_128:", f"{ch_larger_than_128:.6f}", "%")
|
78 |
+
|
79 |
+
ch_larger_than_64 = np.sum(length_all > 64) / len(length_all) * 100
|
80 |
+
print("ch_larger_than_64:", f"{ch_larger_than_64:.6f}", "%")
|
81 |
+
|
82 |
+
song_length_all = length_all.reshape(-1, 13)
|
83 |
+
song_larger_than_512 = 0
|
84 |
+
song_larger_than_256 = 0
|
85 |
+
song_larger_than_128 = 0
|
86 |
+
song_larger_than_64 = 0
|
87 |
+
for l in song_length_all:
|
88 |
+
if np.sum(l > 512) > 0:
|
89 |
+
song_larger_than_512 += 1
|
90 |
+
if np.sum(l > 256) > 0:
|
91 |
+
song_larger_than_256 += 1
|
92 |
+
if np.sum(l > 128) > 0:
|
93 |
+
song_larger_than_128 += 1
|
94 |
+
if np.sum(l > 64) > 0:
|
95 |
+
song_larger_than_64 += 1
|
96 |
+
num_songs = len(song_length_all)
|
97 |
+
print("song_larger_than_512:", f"{song_larger_than_512/num_songs*100:.4f}", "%")
|
98 |
+
print("song_larger_than_256:", f"{song_larger_than_256/num_songs*100:.4f}", "%")
|
99 |
+
print("song_larger_than_128:", f"{song_larger_than_128/num_songs*100:.4f}", "%")
|
100 |
+
print("song_larger_than_64:", f"{song_larger_than_64/num_songs*100:.4f}", "%")
|
101 |
+
|
102 |
+
instr_dict = {
|
103 |
+
0: "Piano",
|
104 |
+
1: "Chromatic Percussion",
|
105 |
+
2: "Organ",
|
106 |
+
3: "Guitar",
|
107 |
+
4: "Bass",
|
108 |
+
5: "Strings + Ensemble",
|
109 |
+
6: "Brass",
|
110 |
+
7: "Reed",
|
111 |
+
8: "Pipe",
|
112 |
+
9: "Synth Lead",
|
113 |
+
10: "Synth Pad",
|
114 |
+
11: "Singing",
|
115 |
+
12: "Drums",
|
116 |
+
}
|
117 |
+
cnt_larger_than_512 = Counter()
|
118 |
+
for i in np.where(length_all > 512)[0] % 13:
|
119 |
+
cnt_larger_than_512[i] += 1
|
120 |
+
print("larger_than_512:")
|
121 |
+
for k, v in cnt_larger_than_512.items():
|
122 |
+
print(f" - {instr_dict[k]}: {v}")
|
123 |
+
|
124 |
+
cnt_larger_than_256 = Counter()
|
125 |
+
for i in np.where(length_all > 256)[0] % 13:
|
126 |
+
cnt_larger_than_256[i] += 1
|
127 |
+
print("larger_than_256:")
|
128 |
+
for k, v in cnt_larger_than_256.items():
|
129 |
+
print(f" - {instr_dict[k]}: {v}")
|
130 |
+
|
131 |
+
cnt_larger_than_128 = Counter()
|
132 |
+
for i in np.where(length_all > 128)[0] % 13:
|
133 |
+
cnt_larger_than_128[i] += 1
|
134 |
+
print("larger_than_128:")
|
135 |
+
for k, v in cnt_larger_than_128.items():
|
136 |
+
print(f" - {instr_dict[k]}: {v}")
|
137 |
+
"""
|
138 |
+
empty_sequences: 91.06 %
|
139 |
+
mean_except_empty: 36.68976799156269
|
140 |
+
median_except_empty: 31.0
|
141 |
+
ch_less_than_768: 100.00 %
|
142 |
+
ch_larger_than_512: 0.000158 %
|
143 |
+
ch_larger_than_256: 0.015132 %
|
144 |
+
ch_larger_than_128: 0.192061 %
|
145 |
+
ch_larger_than_64: 0.661260 %
|
146 |
+
song_larger_than_512: 0.0021 %
|
147 |
+
song_larger_than_256: 0.1926 %
|
148 |
+
song_larger_than_128: 2.2280 %
|
149 |
+
song_larger_than_64: 6.1033 %
|
150 |
+
|
151 |
+
larger_than_512:
|
152 |
+
- Guitar: 7
|
153 |
+
- Strings + Ensemble: 3
|
154 |
+
larger_than_256:
|
155 |
+
- Piano: 177
|
156 |
+
- Guitar: 680
|
157 |
+
- Strings + Ensemble: 79
|
158 |
+
- Organ: 2
|
159 |
+
- Chromatic Percussion: 11
|
160 |
+
- Bass: 1
|
161 |
+
- Synth Lead: 2
|
162 |
+
- Brass: 1
|
163 |
+
- Reed: 5
|
164 |
+
larger_than_128:
|
165 |
+
- Guitar: 4711
|
166 |
+
- Strings + Ensemble: 1280
|
167 |
+
- Piano: 5548
|
168 |
+
- Bass: 211
|
169 |
+
- Synth Pad: 22
|
170 |
+
- Pipe: 18
|
171 |
+
- Chromatic Percussion: 55
|
172 |
+
- Synth Lead: 22
|
173 |
+
- Organ: 75
|
174 |
+
- Reed: 161
|
175 |
+
- Brass: 45
|
176 |
+
- Drums: 11
|
177 |
+
"""
|
amt/src/extras/npy_speed_benchmark.py
ADDED
@@ -0,0 +1,187 @@
import os
import numpy as np  # moved up: `np` is used before the original import further below

from tasks.utils.event_codec import Event, EventRange
from tasks.utils import event_codec

ec = event_codec.Codec(
    max_shift_steps=1000,  # this means 0,1,...,1000
    steps_per_second=100,
    event_ranges=[
        EventRange('pitch', min_value=0, max_value=127),
        EventRange('velocity', min_value=0, max_value=1),
        EventRange('tie', min_value=0, max_value=0),
        EventRange('program', min_value=0, max_value=127),
        EventRange('drum', min_value=0, max_value=127),
    ],
)

events = [
    Event(type='shift', value=0),  # actually not needed
    Event(type='shift', value=1),  # 10 ms shift
    Event(type='shift', value=1000),  # 10 s shift
    Event(type='pitch', value=0),  # lowest pitch 8.18 Hz
    Event(type='pitch', value=60),  # C4 or 261.63 Hz
    Event(type='pitch', value=127),  # highest pitch G9 or 12543.85 Hz
    Event(type='velocity', value=0),  # lowest velocity
    Event(type='velocity', value=1),  # highest velocity
    Event(type='tie', value=0),  # tie
    Event(type='program', value=0),  # program
    Event(type='program', value=127),  # program
    Event(type='drum', value=0),  # drum
    Event(type='drum', value=127),  # drum
]

events = events * 100
tokens = [ec.encode_event(e) for e in events]
tokens = np.array(tokens, dtype=np.int16)

import csv
# Save events to a CSV file
with open('events.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for event in events:
        writer.writerow([event.type, event.value])

# Load events from a CSV file
with open('events.csv', 'r') as file:
    reader = csv.reader(file)
    events2 = [Event(row[0], int(row[1])) for row in reader]

import json
# Save events to a JSON file
with open('events.json', 'w') as file:
    json.dump([event.__dict__ for event in events], file)

# Load events from a JSON file
with open('events.json', 'r') as file:
    events = [Event(**event_dict) for event_dict in json.load(file)]

"""----------------------------"""
# Write the tokens to a npy file
np.save('tokens.npy', tokens)

def t_npy():
    t = np.load('tokens.npy', allow_pickle=True)  # allow_pickle doesn't affect speed

# Write 2400 token arrays as individual npy files
os.makedirs('temp', exist_ok=True)
for i in range(2400):
    np.save(f'temp/tokens{i}.npy', tokens)

def t_npy2400():
    for i in range(2400):
        t = np.load(f'temp/tokens{i}.npy')

def t_npy2400_take200():
    for i in range(200):
        t = np.load(f'temp/tokens{i}.npy')

import shutil
shutil.rmtree('temp', ignore_errors=True)

# Write the 2400 token arrays to a single npy file (as a pickled dict)
data = dict()
for i in range(2400):
    data[f'arr{i}'] = tokens.copy()
np.save('tokens_2400x.npy', data)

def t_npy2400single():
    t = np.load('tokens_2400x.npy', allow_pickle=True).item()

def t_mmap2400single():
    t = np.load('tokens_2400x.npy', mmap_mode='r')

# Write the tokens to a npz file
np.savez('tokens.npz', arr0=tokens)

def t_npz():
    npz_file = np.load('tokens.npz')
    tt = npz_file['arr0']

data = dict()
for i in range(2400):
    data[f'arr{i}'] = tokens
np.savez('tokens.npz', **data)

def t_npz2400():
    npz_file = np.load('tokens.npz')
    for i in range(2400):
        tt = npz_file[f'arr{i}']

def t_npz2400_take200():
    npz_file = np.load('tokens.npz')
    # npz_file.files
    for i in range(200):
        tt = npz_file[f'arr{i}']

# Write the tokens to a txt file
with open('tokens.txt', 'w') as file:
    file.write(' '.join(map(str, tokens)))

def t_txt():
    # Read the tokens from the file
    with open('tokens.txt', 'r') as file:
        t = list(map(int, file.read().split()))
    t = np.array(t)

# Write the tokens to a CSV file
with open('tokens.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(tokens)

def t_csv():
    # Read the tokens from the CSV file
    with open('tokens.csv', 'r') as file:
        reader = csv.reader(file)
        t = list(map(int, next(reader)))
    t = np.array(t)

# Write the tokens to a JSON file (ndarray is not JSON-serializable, so convert to list)
with open('tokens.json', 'w') as file:
    json.dump(tokens.tolist(), file)

def t_json():
    # Read the tokens from the JSON file
    with open('tokens.json', 'r') as file:
        t = json.load(file)
    t = np.array(t)

with open('tokens_2400x.json', 'w') as file:
    json.dump({k: v.tolist() for k, v in data.items()}, file)

def t_json2400single():
    # Read the tokens from the JSON file
    with open('tokens_2400x.json', 'r') as file:
        t = json.load(file)

def t_mmap():
    t = np.load('tokens.npy', mmap_mode='r')

# Write the tokens to bytes file

np.savetxt('tokens.ntxt', tokens)

def t_ntxt():
    t = np.loadtxt('tokens.ntxt').astype(np.int32)

# IPython timings:
%timeit t_npz()   # 139 us
%timeit t_mmap()  # 3.12 ms
%timeit t_npy()   # 87.8 us
%timeit t_txt()   # 109 152 us
%timeit t_csv()   # 145 190 us
%timeit t_json()  # 72.8 119 us
%timeit t_ntxt()  # 878 us

%timeit t_npy2400()  # 212 ms; 2400 files in a folder
%timeit t_npz2400()  # 296 ms; uncompressed arrays in a single file

%timeit t_npy2400_take200()  # 17.4 ms; 25 Mb
%timeit t_npz2400_take200()  # 28.8 ms; 3.72 ms for 10 arrays; 25 Mb
%timeit t_npy2400single()    # 4 ms; frozen dictionary containing 2400 arrays; 6.4 Mb; int16
%timeit t_mmap2400single()   # dictionary is not supported
%timeit t_json2400single()   # 175 ms; 17 Mb
# 2400 files from 100ms hop for 4 minutes
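# --- Added sketch (not in the original file): the `%timeit` lines above only work inside
# --- IPython/Jupyter. A minimal equivalent with the standard-library `timeit` module,
# --- assuming the t_* loader functions above have been defined in the same session:
import timeit

def bench(fn, number=100):
    """Average wall-clock time per call, in microseconds."""
    return timeit.timeit(fn, number=number) / number * 1e6

for fn in (t_npz, t_mmap, t_npy, t_txt, t_csv, t_json, t_ntxt):
    print(f'{fn.__name__}: {bench(fn):.1f} us')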
amt/src/extras/perceivertf_inspect.py
ADDED
@@ -0,0 +1,640 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
|
6 |
+
def l2_normalize(matrix):
|
7 |
+
"""
|
8 |
+
L2 Normalize the matrix along its rows.
|
9 |
+
|
10 |
+
Parameters:
|
11 |
+
matrix (numpy.ndarray): The input matrix.
|
12 |
+
|
13 |
+
Returns:
|
14 |
+
numpy.ndarray: The L2 normalized matrix.
|
15 |
+
"""
|
16 |
+
l2_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
|
17 |
+
normalized_matrix = matrix / l2_norms
|
18 |
+
return normalized_matrix
|
19 |
+
|
20 |
+
|
21 |
+
def z_normalize(matrix):
|
22 |
+
"""
|
23 |
+
Z-normalize the matrix along its rows (mean=0 and std=1).
|
24 |
+
Z-normalization is also known as "standardization", and derives from z-score.
|
25 |
+
Z = (X - mean) / std
|
26 |
+
Z-normalized, each row has mean=0 and std=1.
|
27 |
+
|
28 |
+
Parameters:
|
29 |
+
matrix (numpy.ndarray): The input matrix.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
numpy.ndarray: The Z normalized matrix.
|
33 |
+
"""
|
34 |
+
mean = np.mean(matrix, axis=1, keepdims=True)
|
35 |
+
std = np.std(matrix, axis=1, keepdims=True)
|
36 |
+
normalized_matrix = (matrix - mean) / std
|
37 |
+
return normalized_matrix
|
38 |
+
|
39 |
+
|
40 |
+
def l2_normalize_tensors(tensor_tuple):
|
41 |
+
"""
|
42 |
+
Applies L2 normalization on the last two dimensions for each tensor in a tuple.
|
43 |
+
|
44 |
+
Parameters:
|
45 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors, each of shape (1, k, 30, 30).
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
tuple of torch.Tensor: A tuple containing N L2-normalized tensors.
|
49 |
+
"""
|
50 |
+
normalized_tensors = []
|
51 |
+
for tensor in tensor_tuple:
|
52 |
+
# Ensure the tensor is a floating-point type
|
53 |
+
tensor = tensor.float()
|
54 |
+
|
55 |
+
# Calculate L2 norm on the last two dimensions, keeping the dimensions using keepdim=True
|
56 |
+
l2_norm = torch.linalg.norm(tensor, dim=(-2, -1), keepdim=True)
|
57 |
+
|
58 |
+
# Apply L2 normalization
|
59 |
+
normalized_tensor = tensor / (
|
60 |
+
l2_norm + 1e-7) # Small value to avoid division by zero
|
61 |
+
|
62 |
+
normalized_tensors.append(normalized_tensor)
|
63 |
+
|
64 |
+
return tuple(normalized_tensors)
|
65 |
+
|
66 |
+
|
67 |
+
def z_normalize_tensors(tensor_tuple):
|
68 |
+
"""
|
69 |
+
Applies Z-normalization on the last two dimensions for each tensor in a tuple.
|
70 |
+
|
71 |
+
Parameters:
|
72 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors, each of shape (1, k, 30, 30).
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
tuple of torch.Tensor: A tuple containing N Z-normalized tensors.
|
76 |
+
"""
|
77 |
+
normalized_tensors = []
|
78 |
+
for tensor in tensor_tuple:
|
79 |
+
# Ensure the tensor is a floating-point type
|
80 |
+
tensor = tensor.float()
|
81 |
+
|
82 |
+
# Calculate mean and std on the last two dimensions
|
83 |
+
mean = tensor.mean(dim=(-2, -1), keepdim=True)
|
84 |
+
std = tensor.std(dim=(-2, -1), keepdim=True)
|
85 |
+
|
86 |
+
# Apply Z-normalization
|
87 |
+
normalized_tensor = (tensor - mean) / (
|
88 |
+
std + 1e-7) # Small value to avoid division by zero
|
89 |
+
|
90 |
+
normalized_tensors.append(normalized_tensor)
|
91 |
+
|
92 |
+
return tuple(normalized_tensors)
|
93 |
+
|
94 |
+
|
95 |
+
def apply_temperature_to_attention_tensors(tensor_tuple, temperature=1.0):
|
96 |
+
"""
|
97 |
+
Applies temperature scaling to the attention weights in each tensor in a tuple.
|
98 |
+
|
99 |
+
Parameters:
|
100 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors,
|
101 |
+
each of shape (1, k, 30, 30).
|
102 |
+
temperature (float): Temperature parameter to control the sharpness
|
103 |
+
of the attention weights. Default is 1.0.
|
104 |
+
|
105 |
+
Returns:
|
106 |
+
tuple of torch.Tensor: A tuple containing N tensors with scaled attention weights.
|
107 |
+
"""
|
108 |
+
scaled_attention_tensors = []
|
109 |
+
|
110 |
+
for tensor in tensor_tuple:
|
111 |
+
# Ensure the tensor is a floating-point type
|
112 |
+
tensor = tensor.float()
|
113 |
+
|
114 |
+
# Flatten the last two dimensions
|
115 |
+
flattened_tensor = tensor.reshape(1, tensor.shape[1],
|
116 |
+
-1) # Modified line here
|
117 |
+
|
118 |
+
# Apply temperature scaling and softmax along the last dimension
|
119 |
+
scaled_attention = flattened_tensor / temperature
|
120 |
+
scaled_attention = F.softmax(scaled_attention, dim=-1)
|
121 |
+
|
122 |
+
# Reshape to original shape
|
123 |
+
scaled_attention = scaled_attention.view_as(tensor)
|
124 |
+
|
125 |
+
scaled_attention_tensors.append(scaled_attention)
|
126 |
+
|
127 |
+
return tuple(scaled_attention_tensors)
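# --- Added usage sketch (not in the original file): dummy attention maps shaped like the
# --- (1, k, 30, 30) tensors assumed above. Z-normalization gives each map ~zero mean and
# --- ~unit std; a lower softmax temperature yields sharper (more peaked) attention maps.
if __name__ == "__main__":
    dummy_att = tuple(torch.rand(1, 8, 30, 30) for _ in range(3))
    z_att = z_normalize_tensors(dummy_att)
    print(z_att[0].mean().item(), z_att[0].std().item())  # ~0.0 and ~1.0
    sharp = apply_temperature_to_attention_tensors(dummy_att, temperature=0.1)
    soft = apply_temperature_to_attention_tensors(dummy_att, temperature=2.0)
    print(sharp[0].max().item() > soft[0].max().item())  # expected: True (sharper peak)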
|
128 |
+
|
129 |
+
|
130 |
+
def shorten_att(tensor_tuple, length=30):
|
131 |
+
shortend_tensors = []
|
132 |
+
for tensor in tensor_tuple:
|
133 |
+
shortend_tensors.append(tensor[:, :, :length, :length])
|
134 |
+
return tuple(shortend_tensors)
|
135 |
+
|
136 |
+
|
137 |
+
def keep_top_k(matrix, k=6):
|
138 |
+
"""
|
139 |
+
Keep only the top k values in each row, set the rest to 0.
|
140 |
+
|
141 |
+
Parameters:
|
142 |
+
matrix (numpy.ndarray): The input matrix.
|
143 |
+
k (int): The number of top values to keep in each row.
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
numpy.ndarray: The transformed matrix.
|
147 |
+
"""
|
148 |
+
topk_indices_per_row = np.argpartition(matrix, -k, axis=1)[:, -k:]
|
149 |
+
result_matrix = np.zeros_like(matrix)
|
150 |
+
|
151 |
+
for i in range(matrix.shape[0]):
|
152 |
+
result_matrix[i, topk_indices_per_row[i]] = matrix[
|
153 |
+
i, topk_indices_per_row[i]]
|
154 |
+
return result_matrix
|
155 |
+
|
156 |
+
|
157 |
+
def test_case_forward_enc_perceiver_tf_dec_t5():
|
158 |
+
import torch
|
159 |
+
from model.ymt3 import YourMT3
|
160 |
+
from config.config import audio_cfg, model_cfg, shared_cfg
|
161 |
+
model_cfg["encoder_type"] = "perceiver-tf"
|
162 |
+
model_cfg["encoder"]["perceiver-tf"]["attention_to_channel"] = True
|
163 |
+
model_cfg["encoder"]["perceiver-tf"]["num_latents"] = 24
|
164 |
+
model_cfg["decoder_type"] = "t5"
|
165 |
+
model_cfg["pre_decoder_type"] = "default"
|
166 |
+
|
167 |
+
audio_cfg["codec"] = "spec"
|
168 |
+
audio_cfg["hop_length"] = 300
|
169 |
+
model = YourMT3(audio_cfg=audio_cfg, model_cfg=model_cfg)
|
170 |
+
model.eval()
|
171 |
+
|
172 |
+
# x = torch.randn(2, 1, 32767)
|
173 |
+
# labels = torch.randint(0, 400, (2, 1024), requires_grad=False)
|
174 |
+
|
175 |
+
# # forward
|
176 |
+
# output = model.forward(x, labels)
|
177 |
+
|
178 |
+
# # inference
|
179 |
+
# result = model.inference(x, None)
|
180 |
+
|
181 |
+
# display latents
|
182 |
+
checkpoint = torch.load(
|
183 |
+
"../logs/ymt3/ptf_all_cross_rebal5_spec300_xk2_amp0811_edr_005_attend_c_full_plus_b52/checkpoints/model.ckpt",
|
184 |
+
map_location="cpu")
|
185 |
+
state_dict = checkpoint['state_dict']
|
186 |
+
new_state_dict = {
|
187 |
+
k: v
|
188 |
+
for k, v in state_dict.items() if 'pitchshift' not in k
|
189 |
+
}
|
190 |
+
model.load_state_dict(new_state_dict, strict=False)
|
191 |
+
|
192 |
+
latents = model.encoder.latent_array.latents.detach().numpy()
|
193 |
+
import matplotlib.pyplot as plt
|
194 |
+
import numpy as np
|
195 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
196 |
+
cos = cosine_similarity(latents)
|
197 |
+
|
198 |
+
from utils.data_modules import AMTDataModule
|
199 |
+
from einops import rearrange
|
200 |
+
dm = AMTDataModule(data_preset_multi={"presets": ["slakh"]})
|
201 |
+
dm.setup("test")
|
202 |
+
dl = dm.test_dataloader()
|
203 |
+
ds = list(dl.values())[0].dataset
|
204 |
+
audio, notes, tokens, _ = ds.__getitem__(7)
|
205 |
+
x = audio[[16], ::]
|
206 |
+
label = tokens[[16], :]
|
207 |
+
# spectrogram
|
208 |
+
x_spec = model.spectrogram(x)
|
209 |
+
plt.imshow(x_spec[0].detach().numpy().T, aspect='auto', origin='lower')
|
210 |
+
plt.title("spectrogram")
|
211 |
+
plt.xlabel('time step')
|
212 |
+
plt.ylabel('frequency bin')
|
213 |
+
plt.show()
|
214 |
+
x_conv = model.pre_encoder(x_spec)
|
215 |
+
# Create a larger figure
|
216 |
+
plt.figure(
|
217 |
+
figsize=(15,
|
218 |
+
10)) # Adjust these numbers as needed for width and height
|
219 |
+
plt.subplot(2, 4, 1)
|
220 |
+
plt.imshow(x_spec[0].detach().numpy().T, aspect='auto', origin='lower')
|
221 |
+
plt.title("spectrogram")
|
222 |
+
plt.xlabel('time step')
|
223 |
+
plt.ylabel('frequency bin')
|
224 |
+
plt.subplot(2, 4, 2)
|
225 |
+
plt.imshow(x_conv[0][:, :, 0].detach().numpy().T,
|
226 |
+
aspect='auto',
|
227 |
+
origin='lower')
|
228 |
+
plt.title("conv(spec), ch=0")
|
229 |
+
plt.xlabel('time step')
|
230 |
+
plt.ylabel('F')
|
231 |
+
plt.subplot(2, 4, 3)
|
232 |
+
plt.imshow(x_conv[0][:, :, 42].detach().numpy().T,
|
233 |
+
aspect='auto',
|
234 |
+
origin='lower')
|
235 |
+
plt.title("ch=42")
|
236 |
+
plt.xlabel('time step')
|
237 |
+
plt.ylabel('F')
|
238 |
+
plt.subplot(2, 4, 4)
|
239 |
+
plt.imshow(x_conv[0][:, :, 80].detach().numpy().T,
|
240 |
+
aspect='auto',
|
241 |
+
origin='lower')
|
242 |
+
plt.title("ch=80")
|
243 |
+
plt.xlabel('time step')
|
244 |
+
plt.ylabel('F')
|
245 |
+
plt.subplot(2, 4, 5)
|
246 |
+
plt.imshow(x_conv[0][:, :, 11].detach().numpy().T,
|
247 |
+
aspect='auto',
|
248 |
+
origin='lower')
|
249 |
+
plt.title("ch=11")
|
250 |
+
plt.xlabel('time step')
|
251 |
+
plt.ylabel('F')
|
252 |
+
plt.subplot(2, 4, 6)
|
253 |
+
plt.imshow(x_conv[0][:, :, 20].detach().numpy().T,
|
254 |
+
aspect='auto',
|
255 |
+
origin='lower')
|
256 |
+
plt.title("ch=20")
|
257 |
+
plt.xlabel('time step')
|
258 |
+
plt.ylabel('F')
|
259 |
+
plt.subplot(2, 4, 7)
|
260 |
+
plt.imshow(x_conv[0][:, :, 77].detach().numpy().T,
|
261 |
+
aspect='auto',
|
262 |
+
origin='lower')
|
263 |
+
plt.title("ch=77")
|
264 |
+
plt.xlabel('time step')
|
265 |
+
plt.ylabel('F')
|
266 |
+
plt.subplot(2, 4, 8)
|
267 |
+
plt.imshow(x_conv[0][:, :, 90].detach().numpy().T,
|
268 |
+
aspect='auto',
|
269 |
+
origin='lower')
|
270 |
+
plt.title("ch=90")
|
271 |
+
plt.xlabel('time step')
|
272 |
+
plt.ylabel('F')
|
273 |
+
plt.tight_layout()
|
274 |
+
plt.show()
|
275 |
+
|
276 |
+
# encoding
|
277 |
+
output = model.encoder(inputs_embeds=x_conv,
|
278 |
+
output_hidden_states=True,
|
279 |
+
output_attentions=True)
|
280 |
+
enc_hs_all, att, catt = output["hidden_states"], output[
|
281 |
+
"attentions"], output["cross_attentions"]
|
282 |
+
enc_hs_last = enc_hs_all[2]
|
283 |
+
|
284 |
+
# enc_hs: time-varying encoder hidden state
|
285 |
+
plt.subplot(2, 3, 1)
|
286 |
+
plt.imshow(enc_hs_all[0][0][:, :, 21].detach().numpy().T)
|
287 |
+
plt.title('ENC_HS B0, d21')
|
288 |
+
plt.colorbar(orientation='horizontal')
|
289 |
+
plt.ylabel('latent k')
|
290 |
+
plt.xlabel('t')
|
291 |
+
plt.subplot(2, 3, 4)
|
292 |
+
plt.imshow(enc_hs_all[0][0][:, :, 127].detach().numpy().T)
|
293 |
+
plt.colorbar(orientation='horizontal')
|
294 |
+
plt.title('B0, d127')
|
295 |
+
plt.ylabel('latent k')
|
296 |
+
plt.xlabel('t')
|
297 |
+
plt.subplot(2, 3, 2)
|
298 |
+
plt.imshow(enc_hs_all[1][0][:, :, 21].detach().numpy().T)
|
299 |
+
plt.colorbar(orientation='horizontal')
|
300 |
+
plt.title('B1, d21')
|
301 |
+
plt.ylabel('latent k')
|
302 |
+
plt.xlabel('t')
|
303 |
+
plt.subplot(2, 3, 5)
|
304 |
+
plt.imshow(enc_hs_all[1][0][:, :, 127].detach().numpy().T)
|
305 |
+
plt.colorbar(orientation='horizontal')
|
306 |
+
plt.title('B1, d127')
|
307 |
+
plt.ylabel('latent k')
|
308 |
+
plt.xlabel('t')
|
309 |
+
plt.subplot(2, 3, 3)
|
310 |
+
plt.imshow(enc_hs_all[2][0][:, :, 21].detach().numpy().T)
|
311 |
+
plt.colorbar(orientation='horizontal')
|
312 |
+
plt.title('B2, d21')
|
313 |
+
plt.ylabel('latent k')
|
314 |
+
plt.xlabel('t')
|
315 |
+
plt.subplot(2, 3, 6)
|
316 |
+
plt.imshow(enc_hs_all[2][0][:, :, 127].detach().numpy().T)
|
317 |
+
plt.colorbar(orientation='horizontal')
|
318 |
+
plt.title('B2, d127')
|
319 |
+
plt.ylabel('latent k')
|
320 |
+
plt.xlabel('t')
|
321 |
+
plt.tight_layout()
|
322 |
+
plt.show()
|
323 |
+
|
324 |
+
enc_hs_proj = model.pre_decoder(enc_hs_last)
|
325 |
+
plt.imshow(enc_hs_proj[0].detach().numpy())
|
326 |
+
plt.title(
|
327 |
+
'ENC_HS_PROJ: linear projection of encoder output, which is used for enc-dec cross attention'
|
328 |
+
)
|
329 |
+
plt.colorbar(orientation='horizontal')
|
330 |
+
plt.ylabel('latent k')
|
331 |
+
plt.xlabel('d')
|
332 |
+
plt.show()
|
333 |
+
|
334 |
+
plt.subplot(221)
|
335 |
+
plt.imshow(enc_hs_all[2][0][0, :, :].detach().numpy(), aspect='auto')
|
336 |
+
plt.title('enc_hs, t=0')
|
337 |
+
plt.ylabel('latent k')
|
338 |
+
plt.xlabel('d')
|
339 |
+
plt.subplot(222)
|
340 |
+
plt.imshow(enc_hs_all[2][0][10, :, :].detach().numpy(), aspect='auto')
|
341 |
+
plt.title('enc_hs, t=10')
|
342 |
+
plt.ylabel('latent k')
|
343 |
+
plt.xlabel('d')
|
344 |
+
plt.subplot(223)
|
345 |
+
plt.imshow(enc_hs_all[2][0][20, :, :].detach().numpy(), aspect='auto')
|
346 |
+
plt.title('enc_hs, t=20')
|
347 |
+
plt.ylabel('latent k')
|
348 |
+
plt.xlabel('d')
|
349 |
+
plt.subplot(224)
|
350 |
+
plt.imshow(enc_hs_all[2][0][30, :, :].detach().numpy(), aspect='auto')
|
351 |
+
plt.title('enc_hs, t=30')
|
352 |
+
plt.ylabel('latent k')
|
353 |
+
plt.xlabel('d')
|
354 |
+
plt.tight_layout()
|
355 |
+
plt.show()
|
356 |
+
|
357 |
+
# enc_hs correlation: which dim has most unique info?
|
358 |
+
plt.subplot(1, 3, 1)
|
359 |
+
a = rearrange(enc_hs_last, '1 t k d -> t (k d)').detach().numpy()
|
360 |
+
plt.imshow(cosine_similarity(a))
|
361 |
+
plt.title("enc hs, t x t cos_sim")
|
362 |
+
plt.subplot(1, 3, 2)
|
363 |
+
b = rearrange(enc_hs_last, '1 t k d -> k (t d)').detach().numpy()
|
364 |
+
plt.imshow(cosine_similarity(b))
|
365 |
+
plt.title("enc hs, k x k cos_sim")
|
366 |
+
plt.subplot(1, 3, 3)
|
367 |
+
c = rearrange(enc_hs_last, '1 t k d -> d (k t)').detach().numpy()
|
368 |
+
plt.imshow(cosine_similarity(c))
|
369 |
+
plt.title("cross att, d x d cos_sim")
|
370 |
+
plt.tight_layout()
|
371 |
+
plt.show()
|
372 |
+
|
373 |
+
# enc latent
|
374 |
+
plt.imshow(model.encoder.latent_array.latents.detach().numpy())
|
375 |
+
plt.title('latent array')
|
376 |
+
plt.xlabel('d')
|
377 |
+
plt.ylabel('latent k')
|
378 |
+
plt.show()
|
379 |
+
|
380 |
+
# enc Spectral Cross Attention: (T x head x K x D). How latent K attends to conv channel C?
|
381 |
+
plt.subplot(311)
|
382 |
+
plt.imshow(
|
383 |
+
torch.sum(torch.sum(catt[0][0], axis=0), axis=0).detach().numpy())
|
384 |
+
plt.title('block=0')
|
385 |
+
plt.ylabel('latent k')
|
386 |
+
plt.xlabel('conv channel')
|
387 |
+
plt.subplot(312)
|
388 |
+
plt.imshow(
|
389 |
+
torch.sum(torch.sum(catt[1][0], axis=0), axis=0).detach().numpy())
|
390 |
+
plt.title('block=1')
|
391 |
+
plt.ylabel('latent k')
|
392 |
+
plt.xlabel('conv channel')
|
393 |
+
plt.subplot(313)
|
394 |
+
plt.imshow(
|
395 |
+
torch.sum(torch.sum(catt[2][0], axis=0), axis=0).detach().numpy())
|
396 |
+
plt.title('block=2')
|
397 |
+
plt.ylabel('latent k')
|
398 |
+
plt.xlabel('conv channel')
|
399 |
+
plt.tight_layout()
|
400 |
+
plt.show()
|
401 |
+
# enc Latent Self-attention: How latent K attends to K?
|
402 |
+
plt.subplot(231)
|
403 |
+
plt.imshow(torch.sum(torch.sum(att[0][0], axis=1),
|
404 |
+
axis=0).detach().numpy(),
|
405 |
+
origin='upper')
|
406 |
+
plt.title('B0L0')
|
407 |
+
plt.xlabel('latent k')
|
408 |
+
plt.ylabel('latent k')
|
409 |
+
plt.subplot(234)
|
410 |
+
plt.imshow(torch.sum(torch.sum(att[0][1], axis=1),
|
411 |
+
axis=0).detach().numpy(),
|
412 |
+
origin='upper')
|
413 |
+
plt.title('B0L1')
|
414 |
+
plt.xlabel('latent k')
|
415 |
+
plt.ylabel('latent k')
|
416 |
+
plt.subplot(232)
|
417 |
+
plt.imshow(torch.sum(torch.sum(att[1][0], axis=1),
|
418 |
+
axis=0).detach().numpy(),
|
419 |
+
origin='upper')
|
420 |
+
plt.title('B1L0')
|
421 |
+
plt.xlabel('latent k')
|
422 |
+
plt.ylabel('latent k')
|
423 |
+
plt.subplot(235)
|
424 |
+
plt.imshow(torch.sum(torch.sum(att[1][1], axis=1),
|
425 |
+
axis=0).detach().numpy(),
|
426 |
+
origin='upper')
|
427 |
+
plt.title('B1L1')
|
428 |
+
plt.xlabel('latent k')
|
429 |
+
plt.ylabel('latent k')
|
430 |
+
plt.subplot(233)
|
431 |
+
plt.imshow(torch.sum(torch.sum(att[2][0], axis=1),
|
432 |
+
axis=0).detach().numpy(),
|
433 |
+
origin='upper')
|
434 |
+
plt.title('B2L0')
|
435 |
+
plt.xlabel('latent k')
|
436 |
+
plt.ylabel('latent k')
|
437 |
+
plt.subplot(236)
|
438 |
+
plt.imshow(torch.sum(torch.sum(att[2][1], axis=1),
|
439 |
+
axis=0).detach().numpy(),
|
440 |
+
origin='upper')
|
441 |
+
plt.title('B2L1')
|
442 |
+
plt.xlabel('latent k')
|
443 |
+
plt.ylabel('latent k')
|
444 |
+
plt.tight_layout()
|
445 |
+
plt.show()
|
446 |
+
# Time varying, different head for latent self-attention
|
447 |
+
plt.subplot(231)
|
448 |
+
plt.imshow(att[0][0][30, 3, :, :].detach().numpy())
|
449 |
+
plt.title('B0L0, t=30, Head=3')
|
450 |
+
plt.colorbar(orientation='horizontal')
|
451 |
+
plt.xlabel('k')
|
452 |
+
plt.ylabel('k')
|
453 |
+
plt.subplot(234)
|
454 |
+
plt.imshow(att[0][1][30, 3, :, :].detach().numpy())
|
455 |
+
plt.title('B0L1, t=30, Head=3')
|
456 |
+
plt.colorbar(orientation='horizontal')
|
457 |
+
plt.xlabel('k')
|
458 |
+
plt.ylabel('k')
|
459 |
+
plt.subplot(232)
|
460 |
+
plt.imshow(att[1][0][30, 3, :, :].detach().numpy())
|
461 |
+
plt.title('B1L0, t=30, Head=3')
|
462 |
+
plt.colorbar(orientation='horizontal')
|
463 |
+
plt.xlabel('k')
|
464 |
+
plt.ylabel('k')
|
465 |
+
plt.subplot(235)
|
466 |
+
plt.imshow(att[1][1][30, 3, :, :].detach().numpy())
|
467 |
+
plt.title('B1L1, t=30, Head=3')
|
468 |
+
plt.colorbar(orientation='horizontal')
|
469 |
+
plt.xlabel('k')
|
470 |
+
plt.ylabel('k')
|
471 |
+
plt.subplot(233)
|
472 |
+
plt.imshow(att[2][0][30, 3, :, :].detach().numpy())
|
473 |
+
plt.title('B2L0, t=30, Head=3')
|
474 |
+
plt.colorbar(orientation='horizontal')
|
475 |
+
plt.xlabel('k')
|
476 |
+
plt.ylabel('k')
|
477 |
+
plt.subplot(236)
|
478 |
+
plt.imshow(att[2][1][30, 3, :, :].detach().numpy())
|
479 |
+
plt.title('B2L1, t=30, Head=3')
|
480 |
+
plt.colorbar(orientation='horizontal')
|
481 |
+
plt.xlabel('k')
|
482 |
+
plt.ylabel('k')
|
483 |
+
plt.tight_layout()
|
484 |
+
plt.show()
|
485 |
+
plt.subplot(231)
|
486 |
+
plt.imshow(att[0][0][30, 5, :, :].detach().numpy())
|
487 |
+
plt.title('B0L0, t=30, Head=5')
|
488 |
+
plt.colorbar(orientation='horizontal')
|
489 |
+
plt.xlabel('k')
|
490 |
+
plt.ylabel('k')
|
491 |
+
plt.subplot(234)
|
492 |
+
plt.imshow(att[0][1][30, 5, :, :].detach().numpy())
|
493 |
+
plt.title('B0L1, t=30, Head=5')
|
494 |
+
plt.colorbar(orientation='horizontal')
|
495 |
+
plt.xlabel('k')
|
496 |
+
plt.ylabel('k')
|
497 |
+
plt.subplot(232)
|
498 |
+
plt.imshow(att[1][0][30, 5, :, :].detach().numpy())
|
499 |
+
plt.title('B1L0, t=30, Head=5')
|
500 |
+
plt.colorbar(orientation='horizontal')
|
501 |
+
plt.xlabel('k')
|
502 |
+
plt.ylabel('k')
|
503 |
+
plt.subplot(235)
|
504 |
+
plt.imshow(att[1][1][30, 5, :, :].detach().numpy())
|
505 |
+
plt.title('B1L1, t=30, Head=5')
|
506 |
+
plt.colorbar(orientation='horizontal')
|
507 |
+
plt.xlabel('k')
|
508 |
+
plt.ylabel('k')
|
509 |
+
plt.subplot(233)
|
510 |
+
plt.imshow(att[2][0][30, 5, :, :].detach().numpy())
|
511 |
+
plt.title('B2L0, t=30, Head=5')
|
512 |
+
plt.colorbar(orientation='horizontal')
|
513 |
+
plt.xlabel('k')
|
514 |
+
plt.ylabel('k')
|
515 |
+
plt.subplot(236)
|
516 |
+
plt.imshow(att[2][1][30, 5, :, :].detach().numpy())
|
517 |
+
plt.title('B2L1, t=30, Head=5')
|
518 |
+
plt.colorbar(orientation='horizontal')
|
519 |
+
plt.xlabel('k')
|
520 |
+
plt.ylabel('k')
|
521 |
+
plt.tight_layout()
|
522 |
+
plt.show()
|
523 |
+
|
524 |
+
# Temporal Self-attention: (K x H x T x T) How time t attends to time t?
|
525 |
+
plt.subplot(231)
|
526 |
+
plt.imshow(torch.sum(torch.sum(att[0][2], axis=1),
|
527 |
+
axis=0).detach().numpy(),
|
528 |
+
origin='upper')
|
529 |
+
plt.title('B0L2')
|
530 |
+
plt.xlabel('t')
|
531 |
+
plt.ylabel('t')
|
532 |
+
plt.subplot(234)
|
533 |
+
plt.imshow(torch.sum(torch.sum(att[0][3], axis=1),
|
534 |
+
axis=0).detach().numpy(),
|
535 |
+
origin='upper')
|
536 |
+
plt.title('B0L3')
|
537 |
+
plt.xlabel('t')
|
538 |
+
plt.ylabel('t')
|
539 |
+
plt.subplot(232)
|
540 |
+
plt.imshow(torch.sum(torch.sum(att[1][2], axis=1),
|
541 |
+
axis=0).detach().numpy(),
|
542 |
+
origin='upper')
|
543 |
+
plt.title('B1L2')
|
544 |
+
plt.xlabel('t')
|
545 |
+
plt.ylabel('t')
|
546 |
+
plt.subplot(235)
|
547 |
+
plt.imshow(torch.sum(torch.sum(att[1][3], axis=1),
|
548 |
+
axis=0).detach().numpy(),
|
549 |
+
origin='upper')
|
550 |
+
plt.title('B1L3')
|
551 |
+
plt.xlabel('t')
|
552 |
+
plt.ylabel('t')
|
553 |
+
plt.subplot(233)
|
554 |
+
plt.imshow(torch.sum(torch.sum(att[2][2], axis=1),
|
555 |
+
axis=0).detach().numpy(),
|
556 |
+
origin='upper')
|
557 |
+
plt.title('B2L2')
|
558 |
+
plt.xlabel('t')
|
559 |
+
plt.ylabel('t')
|
560 |
+
plt.subplot(236)
|
561 |
+
plt.imshow(torch.sum(torch.sum(att[2][3], axis=1),
|
562 |
+
axis=0).detach().numpy(),
|
563 |
+
origin='upper')
|
564 |
+
plt.title('B2L3')
|
565 |
+
plt.xlabel('t')
|
566 |
+
plt.ylabel('t')
|
567 |
+
plt.tight_layout()
|
568 |
+
plt.show()
|
569 |
+
|
570 |
+
# decoding
|
571 |
+
dec_input_ids = model.shift_right_fn(label)
|
572 |
+
dec_inputs_embeds = model.embed_tokens(dec_input_ids)
|
573 |
+
dec_output = model.decoder(inputs_embeds=dec_inputs_embeds,
|
574 |
+
encoder_hidden_states=enc_hs_proj,
|
575 |
+
output_attentions=True,
|
576 |
+
output_hidden_states=True,
|
577 |
+
return_dict=True)
|
578 |
+
dec_att, dec_catt = dec_output.attentions, dec_output.cross_attentions
|
579 |
+
dec_hs_all = dec_output.hidden_states
|
580 |
+
|
581 |
+
# dec att
|
582 |
+
plt.subplot(1, 2, 1)
|
583 |
+
plt.imshow(torch.sum(dec_att[0][0], axis=0).detach().numpy())
|
584 |
+
plt.title('decoder attention, layer0')
|
585 |
+
plt.xlabel('decoder time step')
|
586 |
+
plt.ylabel('decoder time step')
|
587 |
+
plt.subplot(1, 2, 2)
|
588 |
+
plt.imshow(torch.sum(dec_att[7][0], axis=0).detach().numpy())
|
589 |
+
plt.title('decoder attention, layer8')
|
590 |
+
plt.xlabel('decoder time step')
|
591 |
+
plt.show()
|
592 |
+
# dec catt
|
593 |
+
plt.imshow(np.rot90((torch.sum(dec_catt[7][0],
|
594 |
+
axis=0))[:1000, :].detach().numpy()),
|
595 |
+
origin='upper',
|
596 |
+
aspect='auto')
|
597 |
+
plt.colorbar()
|
598 |
+
plt.title('decoder cross att, layer8')
|
599 |
+
plt.xlabel('decoder time step')
|
600 |
+
plt.ylabel('encoder frame')
|
601 |
+
plt.show()
|
602 |
+
# dec catt by head with xxx
|
603 |
+
dec_att_z = z_normalize_tensors(shorten_att(dec_att))
|
604 |
+
plt.imshow(dec_att_z[0][0, 0, :, :].detach().numpy())
|
605 |
+
from bertviz import head_view
|
606 |
+
token = []
|
607 |
+
for i in label[0, :30]:
|
608 |
+
token.append(str(i))
|
609 |
+
head_view(dec_att_z, token)
|
610 |
+
|
611 |
+
# dec_hs
|
612 |
+
plt.subplot(1, 2, 1)
|
613 |
+
plt.imshow(dec_hs_all[0][0].detach().numpy(), origin='upper')
|
614 |
+
plt.colorbar(orientation='horizontal')
|
615 |
+
plt.title('decoder hidden state, layer1')
|
616 |
+
plt.xlabel('hidden dim')
|
617 |
+
plt.ylabel('time step')
|
618 |
+
plt.subplot(1, 2, 2)
|
619 |
+
plt.imshow(dec_hs_all[7][0].detach().numpy(), origin='upper')
|
620 |
+
plt.colorbar(orientation='horizontal')
|
621 |
+
plt.title('decoder hidden state, layer8')
|
622 |
+
plt.xlabel('hidden dim')
|
623 |
+
plt.show()
|
624 |
+
|
625 |
+
# lm head
|
626 |
+
logits = model.lm_head(dec_hs_all[0])
|
627 |
+
plt.imshow(logits[0][0:200, :].detach().numpy(), origin='upper')
|
628 |
+
plt.title('lm head logits')
|
629 |
+
plt.xlabel('vocab dim')
|
630 |
+
plt.ylabel('time step')
|
631 |
+
plt.xlim([1000, 1350])
|
632 |
+
plt.show()
|
633 |
+
softmax = torch.nn.Softmax(dim=2)
|
634 |
+
logits_sm = softmax(logits)
|
635 |
+
plt.imshow(logits_sm[0][0:200, :].detach().numpy(), origin='upper')
|
636 |
+
plt.title('lm head softmax')
|
637 |
+
plt.xlabel('vocab dim')
|
638 |
+
plt.ylabel('time step')
|
639 |
+
plt.xlim([1000, 1350])
|
640 |
+
plt.show()
|
amt/src/extras/perceivertf_multi_inspect.py
ADDED
@@ -0,0 +1,778 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torchaudio
|
5 |
+
from matplotlib.animation import FuncAnimation
|
6 |
+
|
7 |
+
def l2_normalize(matrix):
|
8 |
+
"""
|
9 |
+
L2 Normalize the matrix along its rows.
|
10 |
+
|
11 |
+
Parameters:
|
12 |
+
matrix (numpy.ndarray): The input matrix.
|
13 |
+
|
14 |
+
Returns:
|
15 |
+
numpy.ndarray: The L2 normalized matrix.
|
16 |
+
"""
|
17 |
+
l2_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
|
18 |
+
normalized_matrix = matrix / l2_norms
|
19 |
+
return normalized_matrix
|
20 |
+
|
21 |
+
|
22 |
+
def z_normalize(matrix):
|
23 |
+
"""
|
24 |
+
Z-normalize the matrix along its rows (mean=0 and std=1).
|
25 |
+
Z-normalization is also known as "standardization", and derives from z-score.
|
26 |
+
Z = (X - mean) / std
|
27 |
+
Z-normalized, each row has mean=0 and std=1.
|
28 |
+
|
29 |
+
Parameters:
|
30 |
+
matrix (numpy.ndarray): The input matrix.
|
31 |
+
|
32 |
+
Returns:
|
33 |
+
numpy.ndarray: The Z normalized matrix.
|
34 |
+
"""
|
35 |
+
mean = np.mean(matrix, axis=1, keepdims=True)
|
36 |
+
std = np.std(matrix, axis=1, keepdims=True)
|
37 |
+
normalized_matrix = (matrix - mean) / std
|
38 |
+
return normalized_matrix
|
39 |
+
|
40 |
+
|
41 |
+
def l2_normalize_tensors(tensor_tuple):
|
42 |
+
"""
|
43 |
+
Applies L2 normalization on the last two dimensions for each tensor in a tuple.
|
44 |
+
|
45 |
+
Parameters:
|
46 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors, each of shape (1, k, 30, 30).
|
47 |
+
|
48 |
+
Returns:
|
49 |
+
tuple of torch.Tensor: A tuple containing N L2-normalized tensors.
|
50 |
+
"""
|
51 |
+
normalized_tensors = []
|
52 |
+
for tensor in tensor_tuple:
|
53 |
+
# Ensure the tensor is a floating-point type
|
54 |
+
tensor = tensor.float()
|
55 |
+
|
56 |
+
# Calculate L2 norm on the last two dimensions, keeping the dimensions using keepdim=True
|
57 |
+
l2_norm = torch.linalg.norm(tensor, dim=(-2, -1), keepdim=True)
|
58 |
+
|
59 |
+
# Apply L2 normalization
|
60 |
+
normalized_tensor = tensor / (
|
61 |
+
l2_norm + 1e-7) # Small value to avoid division by zero
|
62 |
+
|
63 |
+
normalized_tensors.append(normalized_tensor)
|
64 |
+
|
65 |
+
return tuple(normalized_tensors)
|
66 |
+
|
67 |
+
|
68 |
+
def z_normalize_tensors(tensor_tuple):
|
69 |
+
"""
|
70 |
+
Applies Z-normalization on the last two dimensions for each tensor in a tuple.
|
71 |
+
|
72 |
+
Parameters:
|
73 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors, each of shape (1, k, 30, 30).
|
74 |
+
|
75 |
+
Returns:
|
76 |
+
tuple of torch.Tensor: A tuple containing N Z-normalized tensors.
|
77 |
+
"""
|
78 |
+
normalized_tensors = []
|
79 |
+
for tensor in tensor_tuple:
|
80 |
+
# Ensure the tensor is a floating-point type
|
81 |
+
tensor = tensor.float()
|
82 |
+
|
83 |
+
# Calculate mean and std on the last two dimensions
|
84 |
+
mean = tensor.mean(dim=(-2, -1), keepdim=True)
|
85 |
+
std = tensor.std(dim=(-2, -1), keepdim=True)
|
86 |
+
|
87 |
+
# Apply Z-normalization
|
88 |
+
normalized_tensor = (tensor - mean) / (
|
89 |
+
std + 1e-7) # Small value to avoid division by zero
|
90 |
+
|
91 |
+
normalized_tensors.append(normalized_tensor)
|
92 |
+
|
93 |
+
return tuple(normalized_tensors)
|
94 |
+
|
95 |
+
|
96 |
+
def apply_temperature_to_attention_tensors(tensor_tuple, temperature=1.0):
|
97 |
+
"""
|
98 |
+
Applies temperature scaling to the attention weights in each tensor in a tuple.
|
99 |
+
|
100 |
+
Parameters:
|
101 |
+
tensor_tuple (tuple of torch.Tensor): A tuple containing N tensors,
|
102 |
+
each of shape (1, k, 30, 30).
|
103 |
+
temperature (float): Temperature parameter to control the sharpness
|
104 |
+
of the attention weights. Default is 1.0.
|
105 |
+
|
106 |
+
Returns:
|
107 |
+
tuple of torch.Tensor: A tuple containing N tensors with scaled attention weights.
|
108 |
+
"""
|
109 |
+
scaled_attention_tensors = []
|
110 |
+
|
111 |
+
for tensor in tensor_tuple:
|
112 |
+
# Ensure the tensor is a floating-point type
|
113 |
+
tensor = tensor.float()
|
114 |
+
|
115 |
+
# Flatten the last two dimensions
|
116 |
+
flattened_tensor = tensor.reshape(1, tensor.shape[1],
|
117 |
+
-1) # Modified line here
|
118 |
+
|
119 |
+
# Apply temperature scaling and softmax along the last dimension
|
120 |
+
scaled_attention = flattened_tensor / temperature
|
121 |
+
scaled_attention = F.softmax(scaled_attention, dim=-1)
|
122 |
+
|
123 |
+
# Reshape to original shape
|
124 |
+
scaled_attention = scaled_attention.view_as(tensor)
|
125 |
+
|
126 |
+
scaled_attention_tensors.append(scaled_attention)
|
127 |
+
|
128 |
+
return tuple(scaled_attention_tensors)
|
129 |
+
|
130 |
+
|
131 |
+
def shorten_att(tensor_tuple, length=30):
|
132 |
+
shortend_tensors = []
|
133 |
+
for tensor in tensor_tuple:
|
134 |
+
shortend_tensors.append(tensor[:, :, :length, :length])
|
135 |
+
return tuple(shortend_tensors)
|
136 |
+
|
137 |
+
|
138 |
+
def keep_top_k(matrix, k=6):
|
139 |
+
"""
|
140 |
+
Keep only the top k values in each row, set the rest to 0.
|
141 |
+
|
142 |
+
Parameters:
|
143 |
+
matrix (numpy.ndarray): The input matrix.
|
144 |
+
k (int): The number of top values to keep in each row.
|
145 |
+
|
146 |
+
Returns:
|
147 |
+
numpy.ndarray: The transformed matrix.
|
148 |
+
"""
|
149 |
+
topk_indices_per_row = np.argpartition(matrix, -k, axis=1)[:, -k:]
|
150 |
+
result_matrix = np.zeros_like(matrix)
|
151 |
+
|
152 |
+
for i in range(matrix.shape[0]):
|
153 |
+
result_matrix[i, topk_indices_per_row[i]] = matrix[
|
154 |
+
i, topk_indices_per_row[i]]
|
155 |
+
return result_matrix
|
156 |
+
|
157 |
+
|
158 |
+
def test_case_forward_enc_perceiver_tf_dec_multi_t5():
|
159 |
+
import torch
|
160 |
+
from model.ymt3 import YourMT3
|
161 |
+
from config.config import audio_cfg, model_cfg, shared_cfg
|
162 |
+
model_cfg["encoder_type"] = "perceiver-tf"
|
163 |
+
|
164 |
+
model_cfg["encoder"]["perceiver-tf"]["attention_to_channel"] = True
|
165 |
+
model_cfg["encoder"]["perceiver-tf"]["num_latents"] = 26
|
166 |
+
|
167 |
+
model_cfg["decoder_type"] = "multi-t5"
|
168 |
+
|
169 |
+
audio_cfg["codec"] = "spec"
|
170 |
+
audio_cfg["hop_length"] = 300
|
171 |
+
model = YourMT3(audio_cfg=audio_cfg, model_cfg=model_cfg)
|
172 |
+
model.eval()
|
173 |
+
|
174 |
+
# x = torch.randn(2, 1, 32767)
|
175 |
+
# labels = torch.randint(0, 400, (2, 1024), requires_grad=False)
|
176 |
+
|
177 |
+
# # forward
|
178 |
+
# output = model.forward(x, labels)
|
179 |
+
|
180 |
+
# # inference
|
181 |
+
# result = model.inference(x, None)
|
182 |
+
|
183 |
+
# display latents
|
184 |
+
checkpoint = torch.load(
|
185 |
+
"../logs/ymt3/ptf_mc13_256_all_cross_v6_xk5_amp0811_edr005_attend_c_full_plus_2psn_nl26_sb_b26r_800k/checkpoints/model.ckpt",
|
186 |
+
map_location="cpu")
|
187 |
+
state_dict = checkpoint['state_dict']
|
188 |
+
new_state_dict = {
|
189 |
+
k: v
|
190 |
+
for k, v in state_dict.items() if 'pitchshift' not in k
|
191 |
+
}
|
192 |
+
model.load_state_dict(new_state_dict, strict=False)
|
193 |
+
|
194 |
+
latents = model.encoder.latent_array.latents.detach().numpy()
|
195 |
+
import matplotlib.pyplot as plt
|
196 |
+
import numpy as np
|
197 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
198 |
+
cos = cosine_similarity(latents)
|
199 |
+
|
200 |
+
from utils.data_modules import AMTDataModule
|
201 |
+
from einops import rearrange
|
202 |
+
# dm = AMTDataModule(data_preset_multi={"presets": ["slakh"]})
|
203 |
+
#dm.setup("test")
|
204 |
+
# dl = dm.test_dataloader()
|
205 |
+
# ds = list(dl.values())[0].dataset
|
206 |
+
# audio, notes, tokens, _ = ds.__getitem__(7)
|
207 |
+
# x = audio[[16], ::]
|
208 |
+
# label = tokens[[16], :]
|
209 |
+
|
210 |
+
# from utils.task_manager import TaskManager
|
211 |
+
# tm = TaskManager(task_name='mc13_256')
|
212 |
+
# dm = AMTDataModule(data_preset_multi={"presets": ["slakh"]},
|
213 |
+
# task_manager=tm,
|
214 |
+
# train_stem_iaug_prob=None,
|
215 |
+
# train_stem_xaug_policy=None)
|
216 |
+
# dm.setup('fit')
|
217 |
+
# dl = dm.train_dataloader()
|
218 |
+
# ds = dl.flattened[0].dataset
|
219 |
+
# audio,tokens, _, _ = ds.__getitem__(67)
|
220 |
+
# x = audio[[5], ::]
|
221 |
+
# label = tokens[[5], :]
|
222 |
+
# save audio
|
223 |
+
# torchaudio.save("singing.wav", x[0, :, :], 16000)
|
224 |
+
|
225 |
+
x, _ = torchaudio.load('piano.wav')  # alternatively: 'test.wav'
|
226 |
+
x = x.unsqueeze(0)
|
227 |
+
|
228 |
+
# spectrogram
|
229 |
+
x_spec = model.spectrogram(x)
|
230 |
+
x_conv = model.pre_encoder(x_spec)
|
231 |
+
# Create a larger figure
|
232 |
+
plt.figure(
|
233 |
+
figsize=(15,
|
234 |
+
10)) # Adjust these numbers as needed for width and height
|
235 |
+
plt.subplot(2, 4, 1)
|
236 |
+
plt.imshow(x_spec[0].detach().numpy().T, aspect='auto', origin='lower')
|
237 |
+
plt.title("spectrogram")
|
238 |
+
plt.xlabel('time step')
|
239 |
+
plt.ylabel('frequency bin')
|
240 |
+
plt.subplot(2, 4, 2)
|
241 |
+
plt.imshow(x_conv[0][:, :, 0].detach().numpy().T,
|
242 |
+
aspect='auto',
|
243 |
+
origin='lower')
|
244 |
+
plt.title("conv(spec), ch=0")
|
245 |
+
plt.xlabel('time step')
|
246 |
+
plt.ylabel('F')
|
247 |
+
plt.subplot(2, 4, 3)
|
248 |
+
plt.imshow(x_conv[0][:, :, 42].detach().numpy().T,
|
249 |
+
aspect='auto',
|
250 |
+
origin='lower')
|
251 |
+
plt.title("ch=42")
|
252 |
+
plt.xlabel('time step')
|
253 |
+
plt.ylabel('F')
|
254 |
+
plt.subplot(2, 4, 4)
|
255 |
+
plt.imshow(x_conv[0][:, :, 80].detach().numpy().T,
|
256 |
+
aspect='auto',
|
257 |
+
origin='lower')
|
258 |
+
plt.title("ch=80")
|
259 |
+
plt.xlabel('time step')
|
260 |
+
plt.ylabel('F')
|
261 |
+
plt.subplot(2, 4, 5)
|
262 |
+
plt.imshow(x_conv[0][:, :, 11].detach().numpy().T,
|
263 |
+
aspect='auto',
|
264 |
+
origin='lower')
|
265 |
+
plt.title("ch=11")
|
266 |
+
plt.xlabel('time step')
|
267 |
+
plt.ylabel('F')
|
268 |
+
plt.subplot(2, 4, 6)
|
269 |
+
plt.imshow(x_conv[0][:, :, 20].detach().numpy().T,
|
270 |
+
aspect='auto',
|
271 |
+
origin='lower')
|
272 |
+
plt.title("ch=20")
|
273 |
+
plt.xlabel('time step')
|
274 |
+
plt.ylabel('F')
|
275 |
+
plt.subplot(2, 4, 7)
|
276 |
+
plt.imshow(x_conv[0][:, :, 77].detach().numpy().T,
|
277 |
+
aspect='auto',
|
278 |
+
origin='lower')
|
279 |
+
plt.title("ch=77")
|
280 |
+
plt.xlabel('time step')
|
281 |
+
plt.ylabel('F')
|
282 |
+
plt.subplot(2, 4, 8)
|
283 |
+
plt.imshow(x_conv[0][:, :, 90].detach().numpy().T,
|
284 |
+
aspect='auto',
|
285 |
+
origin='lower')
|
286 |
+
plt.title("ch=90")
|
287 |
+
plt.xlabel('time step')
|
288 |
+
plt.ylabel('F')
|
289 |
+
plt.tight_layout()
|
290 |
+
plt.show()
|
291 |
+
|
292 |
+
# encoding
|
293 |
+
output = model.encoder(inputs_embeds=x_conv,
|
294 |
+
output_hidden_states=True,
|
295 |
+
output_attentions=True)
|
296 |
+
enc_hs_all, att, catt = output["hidden_states"], output[
|
297 |
+
"attentions"], output["cross_attentions"]
|
298 |
+
enc_hs_last = enc_hs_all[2]
|
299 |
+
|
300 |
+
# enc_hs: time-varying encoder hidden state
|
301 |
+
plt.subplot(2, 3, 1)
|
302 |
+
plt.imshow(enc_hs_all[0][0][:, :, 21].detach().numpy().T)
|
303 |
+
plt.title('ENC_HS B0, d21')
|
304 |
+
plt.colorbar(orientation='horizontal')
|
305 |
+
plt.ylabel('latent k')
|
306 |
+
plt.xlabel('t')
|
307 |
+
plt.subplot(2, 3, 4)
|
308 |
+
plt.imshow(enc_hs_all[0][0][:, :, 127].detach().numpy().T)
|
309 |
+
plt.colorbar(orientation='horizontal')
|
310 |
+
plt.title('B0, d127')
|
311 |
+
plt.ylabel('latent k')
|
312 |
+
plt.xlabel('t')
|
313 |
+
plt.subplot(2, 3, 2)
|
314 |
+
plt.imshow(enc_hs_all[1][0][:, :, 21].detach().numpy().T)
|
315 |
+
plt.colorbar(orientation='horizontal')
|
316 |
+
plt.title('B1, d21')
|
317 |
+
plt.ylabel('latent k')
|
318 |
+
plt.xlabel('t')
|
319 |
+
plt.subplot(2, 3, 5)
|
320 |
+
plt.imshow(enc_hs_all[1][0][:, :, 127].detach().numpy().T)
|
321 |
+
plt.colorbar(orientation='horizontal')
|
322 |
+
plt.title('B1, d127')
|
323 |
+
plt.ylabel('latent k')
|
324 |
+
plt.xlabel('t')
|
325 |
+
plt.subplot(2, 3, 3)
|
326 |
+
plt.imshow(enc_hs_all[2][0][:, :, 21].detach().numpy().T)
|
327 |
+
plt.colorbar(orientation='horizontal')
|
328 |
+
plt.title('B2, d21')
|
329 |
+
plt.ylabel('latent k')
|
330 |
+
plt.xlabel('t')
|
331 |
+
plt.subplot(2, 3, 6)
|
332 |
+
plt.imshow(enc_hs_all[2][0][:, :, 127].detach().numpy().T)
|
333 |
+
plt.colorbar(orientation='horizontal')
|
334 |
+
plt.title('B2, d127')
|
335 |
+
plt.ylabel('latent k')
|
336 |
+
plt.xlabel('t')
|
337 |
+
plt.tight_layout()
|
338 |
+
plt.show()
|
339 |
+
|
340 |
+
# enc_hs: time-varying encoder hidden state by k (block, 1, t, k, d)
|
341 |
+
# --> (t, d) for each k in last block
|
342 |
+
data = enc_hs_all[2][0].detach().numpy() # (T, K, D)
|
343 |
+
fig, axs = plt.subplots(
|
344 |
+
5, 5, figsize=(10, 9)) # 25 subplots arranged in 5 rows and 5 columns
|
345 |
+
axs = axs.flatten(
|
346 |
+
) # Flatten the 2D array of axes to 1D for easy iteration
|
347 |
+
|
348 |
+
for k in range(25): # Iterating through K indices from 0 to 24
|
349 |
+
axs[k].imshow(data[:, k, :].T,
|
350 |
+
cmap='viridis') # Transposing the matrix to swap T and D
|
351 |
+
axs[k].set_title(f'k={k}')
|
352 |
+
axs[k].set_xlabel('Time step')
|
353 |
+
axs[k].set_ylabel('Dim')
|
354 |
+
|
355 |
+
# Adjusting layout for better visibility
|
356 |
+
plt.tight_layout()
|
357 |
+
plt.show()
|
358 |
+
|
359 |
+
#!! Projected encoder hidden state for 13 channels, that is conditioning for decoder
|
360 |
+
enc_hs_proj = model.pre_decoder(enc_hs_last)
|
361 |
+
fig, axs = plt.subplots(1, 13, figsize=(26, 8)) # 13 subplots in a row
|
362 |
+
data = enc_hs_proj[0].detach().numpy()
|
363 |
+
for ch in range(13):
|
364 |
+
axs[ch].imshow(np.rot90(data[ch]), cmap='viridis') # Rotate 90 degrees
|
365 |
+
axs[ch].set_title(f'ch: {ch}')
|
366 |
+
axs[ch].set_xlabel('Time step')
|
367 |
+
axs[ch].set_ylabel('Dim')
|
368 |
+
plt.suptitle(
|
369 |
+
'linear projection of encoder outputs by channel, which is conditioning for enc-dec cross attention',
|
370 |
+
y=0.1,
|
371 |
+
fontsize=12)
|
372 |
+
plt.tight_layout(rect=[0, 0.1, 1, 1])
|
373 |
+
plt.show()
|
374 |
+
|
375 |
+
plt.subplot(221)
|
376 |
+
plt.imshow(enc_hs_all[2][0][0, :, :].detach().numpy(), aspect='auto')
|
377 |
+
plt.title('enc_hs, t=0')
|
378 |
+
plt.ylabel('latent k')
|
379 |
+
plt.xlabel('d')
|
380 |
+
plt.subplot(222)
|
381 |
+
plt.imshow(enc_hs_all[2][0][10, :, :].detach().numpy(), aspect='auto')
|
382 |
+
plt.title('enc_hs, t=10')
|
383 |
+
plt.ylabel('latent k')
|
384 |
+
plt.xlabel('d')
|
385 |
+
plt.subplot(223)
|
386 |
+
plt.imshow(enc_hs_all[2][0][20, :, :].detach().numpy(), aspect='auto')
|
387 |
+
plt.title('enc_hs, t=20')
|
388 |
+
plt.ylabel('latent k')
|
389 |
+
plt.xlabel('d')
|
390 |
+
plt.subplot(224)
|
391 |
+
plt.imshow(enc_hs_all[2][0][30, :, :].detach().numpy(), aspect='auto')
|
392 |
+
plt.title('enc_hs, t=30')
|
393 |
+
plt.ylabel('latent k')
|
394 |
+
plt.xlabel('d')
|
395 |
+
plt.tight_layout()
|
396 |
+
plt.show()
|
397 |
+
|
398 |
+
# enc_hs correlation: which dim has most unique info?
|
399 |
+
plt.subplot(1, 3, 1)
|
400 |
+
a = rearrange(enc_hs_last, '1 t k d -> t (k d)').detach().numpy()
|
401 |
+
plt.imshow(cosine_similarity(a))
|
402 |
+
plt.title("enc hs, t x t cos_sim")
|
403 |
+
plt.subplot(1, 3, 2)
|
404 |
+
b = rearrange(enc_hs_last, '1 t k d -> k (t d)').detach().numpy()
|
405 |
+
plt.imshow(cosine_similarity(b))
|
406 |
+
plt.title("enc hs, k x k cos_sim")
|
407 |
+
plt.subplot(1, 3, 3)
|
408 |
+
c = rearrange(enc_hs_last, '1 t k d -> d (k t)').detach().numpy()
|
409 |
+
plt.imshow(cosine_similarity(c))
|
410 |
+
plt.title("cross att, d x d cos_sim")
|
411 |
+
plt.tight_layout()
|
412 |
+
plt.show()
|
413 |
+
|
414 |
+
#!! enc latent
|
415 |
+
plt.imshow(model.encoder.latent_array.latents.detach().numpy())
|
416 |
+
plt.title('latent array')
|
417 |
+
plt.xlabel('d')
|
418 |
+
plt.ylabel('latent k')
|
419 |
+
plt.show()
|
420 |
+
|
421 |
+
#!! enc Spectral Cross Attention: (T x head x K x D). How latent K attends to conv channel C?
|
422 |
+
plt.subplot(311)
|
423 |
+
plt.imshow(
|
424 |
+
torch.sum(torch.sum(catt[0][0], axis=0), axis=0).detach().numpy())
|
425 |
+
plt.title('block=0')
|
426 |
+
plt.ylabel('latent k')
|
427 |
+
plt.xlabel('conv channel')
|
428 |
+
plt.subplot(312)
|
429 |
+
plt.imshow(
|
430 |
+
torch.sum(torch.sum(catt[1][0], axis=0), axis=0).detach().numpy())
|
431 |
+
plt.title('block=1')
|
432 |
+
plt.ylabel('latent k')
|
433 |
+
plt.xlabel('conv channel')
|
434 |
+
plt.subplot(313)
|
435 |
+
plt.imshow(
|
436 |
+
torch.sum(torch.sum(catt[2][0], axis=0), axis=0).detach().numpy())
|
437 |
+
plt.title('block=2')
|
438 |
+
plt.ylabel('latent k')
|
439 |
+
plt.xlabel('conv channel')
|
440 |
+
# f'spectral cross attention. T-C-F Model',
|
441 |
+
# y=0,
|
442 |
+
# fontsize=12)
|
443 |
+
plt.tight_layout()
|
444 |
+
plt.show()
|
445 |
+
|
446 |
+
#!! Animation of SCA for varying time, head in last block
|
447 |
+
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6)) # Adjusted figsize for better layout
|
448 |
+
|
449 |
+
# Function to update the plots for each frame in the animation
|
450 |
+
def update(t):
|
451 |
+
# Clear previous images
|
452 |
+
ax1.clear()
|
453 |
+
ax2.clear()
|
454 |
+
|
455 |
+
# Update subplot for h=3
|
456 |
+
ax1.imshow(catt[2][0][t, 3, :, :].detach().numpy())
|
457 |
+
ax1.set_title(f'block=2, t={t}, head=3')
|
458 |
+
ax1.set_ylabel('latent k'); ax1.set_xlabel('conv channel')
|
459 |
+
|
460 |
+
# Update subplot for h=5
|
461 |
+
ax2.imshow(catt[2][0][t, 5, :, :].detach().numpy())
|
462 |
+
ax2.set_title(f'block=2, t={t}, head=5')
|
463 |
+
ax2.set_ylabel('latent k'); ax2.set_xlabel('conv channel')
|
464 |
+
|
465 |
+
# Adjust layout
|
466 |
+
fig.tight_layout()
|
467 |
+
|
468 |
+
# Create the animation
|
469 |
+
anim = FuncAnimation(fig, update, frames=range(0, 110), interval=200)
|
470 |
+
anim.save('animation.gif', writer='pillow', fps=5)
|
471 |
+
|
472 |
+
|
473 |
+
|
474 |
+
fig, axs = plt.subplots(3, 1, figsize=(12, 18), gridspec_kw={'height_ratios': [1, 1, 0.5]}) # Adjusted for different subplot sizes
|
475 |
+
|
476 |
+
# Subplots for catt visualization (h=3 and h=5)
|
477 |
+
ax_catt3, ax_catt5, ax_att_row = axs
|
478 |
+
|
479 |
+
# Creating 8 subplots for att visualization within the third row
|
480 |
+
for i in range(8):
|
481 |
+
ax_att_row = fig.add_subplot(3, 8, 17 + i) # Adding subplots in the third row
|
482 |
+
|
483 |
+
# Update function for the combined animation
|
484 |
+
def combined_update_smaller_att(t):
|
485 |
+
# Update subplot for catt with h=3
|
486 |
+
ax_catt3.clear()
|
487 |
+
ax_catt3.imshow(catt[2][0][t, 3, :, :].detach().numpy())
|
488 |
+
ax_catt3.set_title(f'block=2, t={t}, head=3')
|
489 |
+
ax_catt3.set_ylabel('latent k'); ax_catt3.set_xlabel('conv channel')
|
490 |
+
|
491 |
+
# Update subplot for catt with h=5
|
492 |
+
ax_catt5.clear()
|
493 |
+
ax_catt5.imshow(catt[2][0][t, 5, :, :].detach().numpy())
|
494 |
+
ax_catt5.set_title(f'block=2, t={t}, head=5')
|
495 |
+
ax_catt5.set_ylabel('latent k'); ax_catt5.set_xlabel('conv channel')
|
496 |
+
|
497 |
+
# Update subplots for att (8 heads in one row)
|
498 |
+
for i in range(8):
|
499 |
+
ax = fig.add_subplot(3, 8, 17 + i)
|
500 |
+
ax.clear()
|
501 |
+
ax.imshow(att[0][1][t, i, :, :].detach().numpy(), cmap='viridis')
|
502 |
+
ax.set_title(f't={t}, head={i}')
|
503 |
+
ax.set_xlabel('k')
|
504 |
+
ax.set_ylabel('k')
|
505 |
+
ax.axis('square') # Make each subplot square-shaped
|
506 |
+
|
507 |
+
# Adjust layout
|
508 |
+
fig.tight_layout()
|
509 |
+
combined_anim_smaller_att = FuncAnimation(fig, combined_update_smaller_att, frames=range(0, 110), interval=200)
|
510 |
+
combined_anim_smaller_att.save('combined_animation_smaller_att.gif', writer='pillow', fps=5)
|
511 |
+
|
512 |
+
|
513 |
+
|
514 |
+
|
515 |
+
|
516 |
+
# enc Latent Self-attention: How latent K attends to K?
|
517 |
+
plt.subplot(231)
|
518 |
+
plt.imshow(torch.sum(torch.sum(att[0][0], axis=1),
|
519 |
+
axis=0).detach().numpy(),
|
520 |
+
origin='upper')
|
521 |
+
plt.title('B0L0')
|
522 |
+
plt.xlabel('latent k')
|
523 |
+
plt.ylabel('latent k')
|
524 |
+
plt.subplot(234)
|
525 |
+
plt.imshow(torch.sum(torch.sum(att[0][1], axis=1),
|
526 |
+
axis=0).detach().numpy(),
|
527 |
+
origin='upper')
|
528 |
+
plt.title('B0L1')
|
529 |
+
plt.xlabel('latent k')
|
530 |
+
plt.ylabel('latent k')
|
531 |
+
plt.subplot(232)
|
532 |
+
plt.imshow(torch.sum(torch.sum(att[1][0], axis=1),
|
533 |
+
axis=0).detach().numpy(),
|
534 |
+
origin='upper')
|
535 |
+
plt.title('B1L0')
|
536 |
+
plt.xlabel('latent k')
|
537 |
+
plt.ylabel('latent k')
|
538 |
+
plt.subplot(235)
|
539 |
+
plt.imshow(torch.sum(torch.sum(att[1][1], axis=1),
|
540 |
+
axis=0).detach().numpy(),
|
541 |
+
origin='upper')
|
542 |
+
plt.title('B1L1')
|
543 |
+
plt.xlabel('latent k')
|
544 |
+
plt.ylabel('latent k')
|
545 |
+
plt.subplot(233)
|
546 |
+
plt.imshow(torch.sum(torch.sum(att[2][0], axis=1),
|
547 |
+
axis=0).detach().numpy(),
|
548 |
+
origin='upper')
|
549 |
+
plt.title('B2L0')
|
550 |
+
plt.xlabel('latent k')
|
551 |
+
plt.ylabel('latent k')
|
552 |
+
plt.subplot(236)
|
553 |
+
plt.imshow(torch.sum(torch.sum(att[2][1], axis=1),
|
554 |
+
axis=0).detach().numpy(),
|
555 |
+
origin='upper')
|
556 |
+
plt.title('B2L1')
|
557 |
+
plt.xlabel('latent k')
|
558 |
+
plt.ylabel('latent k')
|
559 |
+
plt.tight_layout()
|
560 |
+
plt.show()
|
561 |
+
# Time varying, different head for latent self-attention
|
562 |
+
#!!! Display latent self-attention for each head
|
563 |
+
bl = 0 # first latent transformer block, last layer att
|
564 |
+
data = att[bl][1].detach().numpy()
|
565 |
+
time_steps = [30, 50, 100]
|
566 |
+
fig, axs = plt.subplots(
|
567 |
+
len(time_steps), 8,
|
568 |
+
figsize=(16, 6)) # Subplots for each time step and head
|
569 |
+
for i, t in enumerate(time_steps):
|
570 |
+
for head in range(8):
|
571 |
+
axs[i, head].imshow(data[t, head, :, :], cmap='viridis')
|
572 |
+
axs[i, head].set_title(f't={t}, head={head}')
|
573 |
+
axs[i, head].set_xlabel('k')
|
574 |
+
axs[i, head].set_ylabel('k')
|
575 |
+
plt.suptitle(
|
576 |
+
f'latent transformer block={bl}, last layer self-attention over time',
|
577 |
+
y=0,
|
578 |
+
fontsize=12)
|
579 |
+
plt.tight_layout()
|
580 |
+
plt.show()
|
581 |
+
|
582 |
+
bl = 1 # second latent transformer block, last layer att
|
583 |
+
data = att[bl][1].detach().numpy()
|
584 |
+
time_steps = [30, 50, 100]
|
585 |
+
fig, axs = plt.subplots(
|
586 |
+
len(time_steps), 8,
|
587 |
+
figsize=(16, 6)) # Subplots for each time step and head
|
588 |
+
for i, t in enumerate(time_steps):
|
589 |
+
for head in range(8):
|
590 |
+
axs[i, head].imshow(data[t, head, :, :], cmap='viridis')
|
591 |
+
axs[i, head].set_title(f't={t}, head={head}')
|
592 |
+
axs[i, head].set_xlabel('k')
|
593 |
+
axs[i, head].set_ylabel('k')
|
594 |
+
plt.suptitle(
|
595 |
+
f'latent transformer block={bl}, last layer self-attention over time',
|
596 |
+
y=0,
|
597 |
+
fontsize=12)
|
598 |
+
plt.tight_layout()
|
599 |
+
plt.show()
|
600 |
+
|
601 |
+
bl = 2 # last latent transformer block, last layer att
|
602 |
+
data = att[bl][1].detach().numpy()
|
603 |
+
time_steps = [30, 50, 100]
|
604 |
+
fig, axs = plt.subplots(
|
605 |
+
len(time_steps), 8,
|
606 |
+
figsize=(16, 6)) # Subplots for each time step and head
|
607 |
+
for i, t in enumerate(time_steps):
|
608 |
+
for head in range(8):
|
609 |
+
axs[i, head].imshow(data[t, head, :, :], cmap='viridis')
|
610 |
+
axs[i, head].set_title(f't={t}, head={head}')
|
611 |
+
axs[i, head].set_xlabel('k')
|
612 |
+
axs[i, head].set_ylabel('k')
|
613 |
+
plt.suptitle(
|
614 |
+
f'latent transformer block={bl}, last layer self-attention over time',
|
615 |
+
y=0,
|
616 |
+
fontsize=12)
|
617 |
+
plt.tight_layout()
|
618 |
+
plt.show()
|
619 |
+
|
620 |
+
# Temporal self-attention: (K x H x T x T), i.e. how each time step attends to the other time steps
|
621 |
+
plt.subplot(231)
|
622 |
+
plt.imshow(torch.sum(torch.sum(att[0][2], axis=1),
|
623 |
+
axis=0).detach().numpy(),
|
624 |
+
origin='upper')
|
625 |
+
plt.title('B0L2')
|
626 |
+
plt.xlabel('t')
|
627 |
+
plt.ylabel('t')
|
628 |
+
plt.subplot(234)
|
629 |
+
plt.imshow(torch.sum(torch.sum(att[0][3], axis=1),
|
630 |
+
axis=0).detach().numpy(),
|
631 |
+
origin='upper')
|
632 |
+
plt.title('B0L3')
|
633 |
+
plt.xlabel('t')
|
634 |
+
plt.ylabel('t')
|
635 |
+
plt.subplot(232)
|
636 |
+
plt.imshow(torch.sum(torch.sum(att[1][2], axis=1),
|
637 |
+
axis=0).detach().numpy(),
|
638 |
+
origin='upper')
|
639 |
+
plt.title('B1L2')
|
640 |
+
plt.xlabel('t')
|
641 |
+
plt.ylabel('t')
|
642 |
+
plt.subplot(235)
|
643 |
+
plt.imshow(torch.sum(torch.sum(att[1][3], axis=1),
|
644 |
+
axis=0).detach().numpy(),
|
645 |
+
origin='upper')
|
646 |
+
plt.title('B1L3')
|
647 |
+
plt.xlabel('t')
|
648 |
+
plt.ylabel('t')
|
649 |
+
plt.subplot(233)
|
650 |
+
plt.imshow(torch.sum(torch.sum(att[2][2], axis=1),
|
651 |
+
axis=0).detach().numpy(),
|
652 |
+
origin='upper')
|
653 |
+
plt.title('B2L2')
|
654 |
+
plt.xlabel('t')
|
655 |
+
plt.ylabel('t')
|
656 |
+
plt.subplot(236)
|
657 |
+
plt.imshow(torch.sum(torch.sum(att[2][3], axis=1),
|
658 |
+
axis=0).detach().numpy(),
|
659 |
+
origin='upper')
|
660 |
+
plt.title('B2L3')
|
661 |
+
plt.xlabel('t')
|
662 |
+
plt.ylabel('t')
|
663 |
+
plt.tight_layout()
|
664 |
+
plt.show()
|
665 |
+
|
666 |
+
# decoding
|
667 |
+
dec_input_ids = model.shift_right_fn(label)
|
668 |
+
dec_inputs_embeds = model.embed_tokens(dec_input_ids)
|
669 |
+
dec_output = model.decoder(inputs_embeds=dec_inputs_embeds,
|
670 |
+
encoder_hidden_states=enc_hs_proj,
|
671 |
+
output_attentions=True,
|
672 |
+
output_hidden_states=True,
|
673 |
+
return_dict=True)
|
674 |
+
dec_att, dec_catt = dec_output.attentions, dec_output.cross_attentions
|
675 |
+
dec_hs_all = dec_output.hidden_states
|
676 |
+
dec_last_hs = dec_output.last_hidden_state
|
677 |
+
|
678 |
+
# lm head
|
679 |
+
logits = model.lm_head(dec_last_hs)
|
680 |
+
|
681 |
+
# pred ids
|
682 |
+
pred_ids = torch.argmax(logits, dim=3)
|
683 |
+
|
684 |
+
# dec att
|
685 |
+
plt.subplot(1, 2, 1)
|
686 |
+
plt.imshow(torch.sum(dec_att[5][0], axis=0).detach().numpy())
|
687 |
+
plt.title('decoder attention, layer 5')
|
688 |
+
plt.xlabel('decoder time step')
|
689 |
+
plt.ylabel('decoder time step')
|
690 |
+
plt.subplot(1, 2, 2)
|
691 |
+
plt.imshow(torch.sum(dec_att[7][0], axis=0).detach().numpy())
|
692 |
+
plt.title('decoder attention, final layer')
|
693 |
+
plt.xlabel('decoder step')
|
694 |
+
plt.show()
|
695 |
+
|
696 |
+
|
697 |
+
# dec catt
|
698 |
+
def remove_values_after_eos(catt_np, pred_ids, max_k):
|
699 |
+
# catt_np: (k, head, t, t)
|
700 |
+
# pred_ids: (1, k, t))
|
701 |
+
max_length = pred_ids.shape[-1]
|
702 |
+
seq_lengths = np.zeros((max_k), dtype=np.int32)
|
703 |
+
for k in range(max_k):
|
704 |
+
for t in range(max_length):
|
705 |
+
if pred_ids[0, k, t] == 1:  # token id 1 is <eos>
|
706 |
+
break
|
707 |
+
catt_np[k, :, t+1:, :] = 0
|
708 |
+
# catt_np[k, :, :, t+1:] = 0
|
709 |
+
seq_lengths[k] = t+1
|
710 |
+
return catt_np, seq_lengths
|
711 |
+
|
712 |
+
# data = dec_catt[1].detach().numpy() # last layer's cross attention
|
713 |
+
l = 4
|
714 |
+
data = dec_catt[l].detach().numpy()
|
715 |
+
data, seq_lengths = remove_values_after_eos(data, pred_ids, max_k=13)
|
716 |
+
seq_lengths[:] = 256  # override the detected lengths to plot all 256 decoder steps
|
717 |
+
|
718 |
+
fig, axs = plt.subplots(13, 6, figsize=(21, 39))  # 13 rows (for k=0:12) and 6 columns (for head=0:5)
|
719 |
+
for k in range(13):
|
720 |
+
s = seq_lengths[k]
|
721 |
+
for head in range(6):
|
722 |
+
axs[k, head].imshow(data[k, head, :s, :].T, aspect='auto', cmap='viridis')
|
723 |
+
axs[k, head].set_title(f'Layer {l}, k={k}, head={head}')
|
724 |
+
axs[k, head].set_xlabel('Decoder step')
|
725 |
+
axs[k, head].set_ylabel('Encoder frame')
|
726 |
+
plt.tight_layout()
|
727 |
+
plt.show()
|
728 |
+
|
729 |
+
|
730 |
+
# # dec catt by head with xxx
|
731 |
+
# dec_att_z = z_normalize_tensors(shorten_att(dec_att))
|
732 |
+
# plt.imshow(dec_att_z[0][0, 0, :, :].detach().numpy())
|
733 |
+
# from bertviz import head_view
|
734 |
+
# token = []
|
735 |
+
# for i in label[0, :30]:
|
736 |
+
# token.append(str(i))
|
737 |
+
# head_view(dec_att_z, tokens)
|
738 |
+
|
739 |
+
# dec_hs
|
740 |
+
plt.subplot(1, 2, 1)
|
741 |
+
k=2
|
742 |
+
plt.imshow(dec_last_hs[0][k].detach().numpy(), origin='upper')
|
743 |
+
plt.colorbar(orientation='horizontal')
|
744 |
+
plt.title('decoder last hidden state, k=2')
|
745 |
+
plt.xlabel('hidden dim')
|
746 |
+
plt.ylabel('time step')
|
747 |
+
plt.subplot(1, 2, 2)
|
748 |
+
k=12
|
749 |
+
plt.imshow(dec_last_hs[0][k].detach().numpy(), origin='upper')
|
750 |
+
plt.colorbar(orientation='horizontal')
|
751 |
+
plt.title('decoder last hidden state, k=12')
|
752 |
+
plt.xlabel('hidden dim')
|
753 |
+
plt.show()
|
754 |
+
|
755 |
+
# lm head
|
756 |
+
logits = model.lm_head(dec_last_hs)
|
757 |
+
k=6
|
758 |
+
plt.imshow(logits[0][k][0:200, :].detach().numpy().T, origin='upper')
|
759 |
+
plt.title('lm head output')
|
760 |
+
plt.xlabel('vocab dim')
|
761 |
+
plt.ylabel('time step')
|
762 |
+
plt.show()
|
763 |
+
softmax = torch.nn.Softmax(dim=3)
|
764 |
+
logits_sm = softmax(logits) # B, K, T, V
|
765 |
+
k=6
|
766 |
+
plt.imshow(logits_sm[0][k][:255, :].detach().numpy().T, origin='upper')
|
767 |
+
plt.title('lm head softmax')
|
768 |
+
plt.xlabel('vocab dim')
|
769 |
+
plt.ylabel('time step')
|
770 |
+
# plt.xlim([1000, 1350])
|
771 |
+
plt.show()
|
772 |
+
|
773 |
+
k = 10
|
774 |
+
print(torch.argmax(logits, dim=3)[0,k,:])
|
775 |
+
|
776 |
+
|
777 |
+
|
778 |
+
|
amt/src/extras/pitch_shift_benchmark.py
ADDED
@@ -0,0 +1,167 @@
1 |
+
""" Test the speed of the augmentation """
|
2 |
+
import torch
|
3 |
+
import torchaudio
|
4 |
+
|
5 |
+
# Device
|
6 |
+
device = torch.device("cuda")
|
7 |
+
# device = torch.device("cpu")
|
8 |
+
|
9 |
+
# Music
|
10 |
+
# x, _ = torchaudio.load("music.wav")
|
11 |
+
# slice_length = 32767
|
12 |
+
# n_slices = 80
|
13 |
+
# slices = [x[0, i * slice_length:(i + 1) * slice_length] for i in range(n_slices)]
|
14 |
+
# x = torch.stack(slices) # (80, 32767)
|
15 |
+
# Sine wave
|
16 |
+
t = torch.arange(0, 2.0479, 1 / 16000) # 2.05 seconds at 16kHz
|
17 |
+
x = torch.sin(2 * torch.pi * 440 * t) * 0.5
|
18 |
+
x = x.reshape(1, 1, 32767).tile(80, 1, 1)
|
19 |
+
x = x.to(device)
|
20 |
+
|
21 |
+
############################################################################################
|
22 |
+
# torch-audiomentations: https://github.com/asteroid-team/torch-audiomentations
|
23 |
+
#
|
24 |
+
# process time <CPU>: 1.18 s ± 5.35 ms
|
25 |
+
# process time <GPU>: 58 ms
|
26 |
+
# GPU memory usage: 3.8 GB per 1 semitone
|
27 |
+
############################################################################################
|
28 |
+
import torch
|
29 |
+
from torch_audiomentations import Compose, PitchShift, Gain, PolarityInversion
|
30 |
+
|
31 |
+
apply_augmentation = Compose(transforms=[
|
32 |
+
# Gain(
|
33 |
+
# min_gain_in_db=-15.0,
|
34 |
+
# max_gain_in_db=5.0,
|
35 |
+
# p=0.5,
|
36 |
+
# ),
|
37 |
+
# PolarityInversion(p=0.5)
|
38 |
+
PitchShift(
|
39 |
+
min_transpose_semitones=0,
|
40 |
+
max_transpose_semitones=2.2,
|
41 |
+
mode="per_batch", #"per_example",
|
42 |
+
p=1.0,
|
43 |
+
p_mode="per_batch",
|
44 |
+
sample_rate=16000,
|
45 |
+
target_rate=16000)
|
46 |
+
])
|
47 |
+
x_am = apply_augmentation(x, sample_rate=16000)
|
48 |
+
|
49 |
+
############################################################################################
|
50 |
+
# torchaudio:
|
51 |
+
#
|
52 |
+
# process time <CPU>: 4.01 s ± 19.6 ms per loop
|
53 |
+
# process time <GPU>: 25.1 ms ± 161 µs per loop
|
54 |
+
# memory usage <GPU>: 1.2 (growth to 5.49) GB per 1 semitone
|
55 |
+
############################################################################################
|
56 |
+
from torchaudio import transforms
|
57 |
+
|
58 |
+
ta_transform = transforms.PitchShift(16000, n_steps=2).to(device)
|
59 |
+
x_ta = ta_transform(x)
|
60 |
+
|
61 |
+
############################################################################################
|
62 |
+
# YourMT3 pitch_shift_layer:
|
63 |
+
#
|
64 |
+
# process time <CPU>: 389ms ± 22ms, (stretch=143 ms, resampler=245 ms)
|
65 |
+
# process time <GPU>: 7.18 ms ± 17.3 µs (stretch=6.47 ms, resampler=0.71 ms)
|
66 |
+
# memory usage: 16 MB per 1 semitone (average)
|
67 |
+
############################################################################################
|
68 |
+
from model.pitchshift_layer import PitchShiftLayer
|
69 |
+
|
70 |
+
ps_ymt3 = PitchShiftLayer(pshift_range=[2, 2], fs=16000, min_gcd=16, n_fft=2048).to(device)
|
71 |
+
x_ymt3 = ps_ymt3(x, 2)
|
72 |
+
|
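# ------------------------------------------------------------------------------------------
# (Sketch, not part of the original benchmark.) The process times quoted in the section
# headers above appear to be %timeit-style averages. A minimal way to reproduce them is a
# synchronized wall-clock loop; torch.cuda.synchronize() makes sure the asynchronous GPU
# kernels have finished before the timer stops. `measure_avg_time` is a hypothetical helper.
import time

def measure_avg_time(fn, n_runs=10):
    if device.type == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(n_runs):
        fn()
    if device.type == "cuda":
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_runs

# e.g. measure_avg_time(lambda: ps_ymt3(x, 2)) or measure_avg_time(lambda: ta_transform(x))
# ------------------------------------------------------------------------------------------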
73 |
+
############################################################################################
|
74 |
+
# Plot 1: Comparison of Process Time and GPU Memory Usage for 3 Pitch Shifting Methods
|
75 |
+
############################################################################################
|
76 |
+
import matplotlib.pyplot as plt
|
77 |
+
|
78 |
+
# Model names
|
79 |
+
models = ['torch-audiomentations', 'torchaudio', 'YourMT3:PitchShiftLayer']
|
80 |
+
|
81 |
+
# Process time (CPU) in seconds
|
82 |
+
cpu_time = [1.18, 4.01, 0.389]
|
83 |
+
|
84 |
+
# Process time (GPU) in milliseconds
|
85 |
+
gpu_time = [58, 25.1, 7.18]
|
86 |
+
|
87 |
+
# GPU memory usage in GB
|
88 |
+
gpu_memory = [3.8, 5.49, 0.016]
|
89 |
+
|
90 |
+
# Creating subplots
|
91 |
+
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
|
92 |
+
|
93 |
+
# Creating bar charts
|
94 |
+
bar1 = axs[0].bar(models, cpu_time, color=['#FFB6C1', '#ADD8E6', '#98FB98'])
|
95 |
+
bar2 = axs[1].bar(models, gpu_time, color=['#FFB6C1', '#ADD8E6', '#98FB98'])
|
96 |
+
bar3 = axs[2].bar(models, gpu_memory, color=['#FFB6C1', '#ADD8E6', '#98FB98'])
|
97 |
+
|
98 |
+
# Adding labels and titles
|
99 |
+
axs[0].set_ylabel('Time (s)')
|
100 |
+
axs[0].set_title('Process Time (CPU) bsz=80')
|
101 |
+
axs[1].set_ylabel('Time (ms)')
|
102 |
+
axs[1].set_title('Process Time (GPU) bsz=80')
|
103 |
+
axs[2].set_ylabel('Memory (GB)')
|
104 |
+
axs[2].set_title('GPU Memory Usage per semitone')
|
105 |
+
|
106 |
+
# Adding grid for better readability of the plots
|
107 |
+
for ax in axs:
|
108 |
+
ax.grid(axis='y')
|
109 |
+
ax.set_yscale('log')
|
110 |
+
ax.set_xticklabels(models, rotation=45, ha="right")
|
111 |
+
|
112 |
+
# Adding text labels above the bars
|
113 |
+
for i, rect in enumerate(bar1):
|
114 |
+
axs[0].text(
|
115 |
+
rect.get_x() + rect.get_width() / 2,
|
116 |
+
rect.get_height(),
|
117 |
+
f'{cpu_time[i]:.2f} s',
|
118 |
+
ha='center',
|
119 |
+
va='bottom')
|
120 |
+
for i, rect in enumerate(bar2):
|
121 |
+
axs[1].text(
|
122 |
+
rect.get_x() + rect.get_width() / 2,
|
123 |
+
rect.get_height(),
|
124 |
+
f'{gpu_time[i]:.2f} ms',
|
125 |
+
ha='center',
|
126 |
+
va='bottom')
|
127 |
+
for i, rect in enumerate(bar3):
|
128 |
+
axs[2].text(
|
129 |
+
rect.get_x() + rect.get_width() / 2,
|
130 |
+
rect.get_height(),
|
131 |
+
f'{gpu_memory[i]:.3f} GB',
|
132 |
+
ha='center',
|
133 |
+
va='bottom')
|
134 |
+
plt.tight_layout()
|
135 |
+
plt.show()
|
136 |
+
|
137 |
+
############################################################################################
|
138 |
+
# Plot 2: Stretch and Resampler Processing Time Contribution
|
139 |
+
############################################################################################
|
140 |
+
# Data
|
141 |
+
processing_type = ['Stretch (Phase Vocoder)', 'Resampler (Conv1D)']
|
142 |
+
cpu_times = [143, 245] # [Stretch, Resampler] times for CPU in milliseconds
|
143 |
+
gpu_times = [6.47, 0.71] # [Stretch, Resampler] times for GPU in milliseconds
|
144 |
+
|
145 |
+
# Creating subplots
|
146 |
+
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
|
147 |
+
|
148 |
+
# Plotting bar charts
|
149 |
+
axs[0].bar(processing_type, cpu_times, color=['#ADD8E6', '#98FB98'])
|
150 |
+
axs[1].bar(processing_type, gpu_times, color=['#ADD8E6', '#98FB98'])
|
151 |
+
|
152 |
+
# Adding labels and titles
|
153 |
+
axs[0].set_ylabel('Time (ms)')
|
154 |
+
axs[0].set_title('Contribution of CPU Processing Time: YMT3-PS (BSZ=80)')
|
155 |
+
axs[1].set_title('Contribution of GPU Processing Time: YMT3-PS (BSZ=80)')
|
156 |
+
|
157 |
+
# Adding grid for better readability of the plots
|
158 |
+
for ax in axs:
|
159 |
+
ax.grid(axis='y')
|
160 |
+
ax.set_yscale('log') # Log scale to better visualize the smaller values
|
161 |
+
|
162 |
+
# Adding values on top of the bars
|
163 |
+
for ax, times in zip(axs, [cpu_times, gpu_times]):
|
164 |
+
for idx, time in enumerate(times):
|
165 |
+
ax.text(idx, time, f"{time:.2f} ms", ha='center', va='bottom', fontsize=8)
|
166 |
+
plt.tight_layout()
|
167 |
+
plt.show()
|
amt/src/extras/remove_silence_musicnet_midi.py
ADDED
@@ -0,0 +1,32 @@
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
|
4 |
+
from utils.midi import midi2note
|
5 |
+
from utils.note2event import note2note_event
|
6 |
+
from utils.note_event_dataclasses import Note
|
7 |
+
from utils.note_event_dataclasses import NoteEvent
|
8 |
+
from utils.midi import note_event2midi
|
9 |
+
|
10 |
+
data_home = '../../data'
|
11 |
+
dataset_name = 'musicnet'
|
12 |
+
base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
|
13 |
+
mid_pattern = os.path.join(base_dir, '*_midi', '*.mid')
|
14 |
+
mid_files = glob.glob(mid_pattern, recursive=True)
|
15 |
+
|
16 |
+
for mid_file in mid_files:
|
17 |
+
notes, _ = midi2note(mid_file)
|
18 |
+
first_onset_time = notes[0].onset
|
19 |
+
fixed_notes = []
|
20 |
+
for note in notes:
|
21 |
+
fixed_notes.append(
|
22 |
+
Note(
|
23 |
+
is_drum=note.is_drum,
|
24 |
+
program=note.program,
|
25 |
+
onset=note.onset - first_onset_time,
|
26 |
+
offset=note.offset - first_onset_time,
|
27 |
+
pitch=note.pitch,
|
28 |
+
velocity=note.velocity))
|
29 |
+
assert len(notes) == len(fixed_notes)
|
30 |
+
fixed_note_events = note2note_event(fixed_notes, return_activity=False)
|
31 |
+
note_event2midi(fixed_note_events, mid_file)
|
32 |
+
print(f'Overwriting {mid_file}')
|
amt/src/extras/rotary_positional_embedding.py
ADDED
@@ -0,0 +1,191 @@
1 |
+
"""rotary_positional_embedding.py - Rotary Positional Embedding
|
2 |
+
|
3 |
+
code from github.com/lucidrains/rotary-embedding-torch
|
4 |
+
|
5 |
+
MIT License
|
6 |
+
"""
|
7 |
+
|
8 |
+
from math import pi, log
|
9 |
+
import torch
|
10 |
+
from torch import nn, einsum
|
11 |
+
from einops import rearrange, repeat
|
12 |
+
|
13 |
+
|
14 |
+
def exists(val):
|
15 |
+
return val is not None
|
16 |
+
|
17 |
+
|
18 |
+
def broadcat(tensors, dim=-1):
|
19 |
+
num_tensors = len(tensors)
|
20 |
+
shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
|
21 |
+
assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
|
22 |
+
shape_len = list(shape_lens)[0]
|
23 |
+
|
24 |
+
dim = (dim + shape_len) if dim < 0 else dim
|
25 |
+
dims = list(zip(*map(lambda t: list(t.shape), tensors)))
|
26 |
+
|
27 |
+
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
|
28 |
+
assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)
|
29 |
+
]), 'invalid dimensions for broadcastable concatenation'
|
30 |
+
max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
|
31 |
+
expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
|
32 |
+
expanded_dims.insert(dim, (dim, dims[dim]))
|
33 |
+
expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
|
34 |
+
tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
|
35 |
+
return torch.cat(tensors, dim=dim)
|
36 |
+
|
37 |
+
|
38 |
+
# rotary embedding helper functions
|
39 |
+
def rotate_half(x):
|
40 |
+
x = rearrange(x, '... (d r) -> ... d r', r=2)
|
41 |
+
x1, x2 = x.unbind(dim=-1)
|
42 |
+
x = torch.stack((-x2, x1), dim=-1)
|
43 |
+
return rearrange(x, '... d r -> ... (d r)')
|
44 |
+
|
45 |
+
|
46 |
+
def apply_rotary_emb(freqs, t, start_index=0, scale=1.):
|
47 |
+
rot_dim, seq_len = freqs.shape[-1], t.shape[-2]
|
48 |
+
freqs = freqs[-seq_len:, :]
|
49 |
+
|
50 |
+
freqs = freqs.to(t)
|
51 |
+
end_index = start_index + rot_dim
|
52 |
+
assert rot_dim <= t.shape[
|
53 |
+
-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
|
54 |
+
t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
|
55 |
+
t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
|
56 |
+
return torch.cat((t_left, t, t_right), dim=-1)
|
57 |
+
|
58 |
+
|
59 |
+
# learned rotation helpers
|
60 |
+
def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
|
61 |
+
if exists(freq_ranges):
|
62 |
+
rotations = einsum('..., f -> ... f', rotations, freq_ranges)
|
63 |
+
rotations = rearrange(rotations, '... r f -> ... (r f)')
|
64 |
+
|
65 |
+
rotations = repeat(rotations, '... n -> ... (n r)', r=2)
|
66 |
+
return apply_rotary_emb(rotations, t, start_index=start_index)
|
67 |
+
|
68 |
+
|
69 |
+
# classes
|
70 |
+
class RotaryEmbedding(nn.Module):
|
71 |
+
|
72 |
+
def __init__(self,
|
73 |
+
dim,
|
74 |
+
custom_freqs=None,
|
75 |
+
freqs_for='lang',
|
76 |
+
theta=10000,
|
77 |
+
max_freq=10,
|
78 |
+
num_freqs=1,
|
79 |
+
learned_freq=False,
|
80 |
+
use_xpos=False,
|
81 |
+
xpos_scale_base=512,
|
82 |
+
interpolate_factor=1.,
|
83 |
+
theta_rescale_factor=1.):
|
84 |
+
super().__init__()
|
85 |
+
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
|
86 |
+
# has some connection to NTK literature
|
87 |
+
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
88 |
+
theta *= theta_rescale_factor**(dim / (dim - 2))
|
89 |
+
|
90 |
+
if exists(custom_freqs):
|
91 |
+
freqs = custom_freqs
|
92 |
+
elif freqs_for == 'lang':
|
93 |
+
freqs = 1. / (theta**(torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
|
94 |
+
elif freqs_for == 'pixel':
|
95 |
+
freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
|
96 |
+
elif freqs_for == 'constant':
|
97 |
+
freqs = torch.ones(num_freqs).float()
|
98 |
+
else:
|
99 |
+
raise ValueError(f'unknown modality {freqs_for}')
|
100 |
+
|
101 |
+
self.cache = dict()
|
102 |
+
self.cache_scale = dict()
|
103 |
+
self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)
|
104 |
+
|
105 |
+
# interpolation factors
|
106 |
+
|
107 |
+
assert interpolate_factor >= 1.
|
108 |
+
self.interpolate_factor = interpolate_factor
|
109 |
+
|
110 |
+
# xpos
|
111 |
+
|
112 |
+
self.use_xpos = use_xpos
|
113 |
+
if not use_xpos:
|
114 |
+
self.register_buffer('scale', None)
|
115 |
+
return
|
116 |
+
|
117 |
+
scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
|
118 |
+
self.scale_base = xpos_scale_base
|
119 |
+
self.register_buffer('scale', scale)
|
120 |
+
|
121 |
+
def get_seq_pos(self, seq_len, device, dtype, offset=0):
|
122 |
+
return (torch.arange(seq_len, device=device, dtype=dtype) +
|
123 |
+
offset) / self.interpolate_factor
|
124 |
+
|
125 |
+
def rotate_queries_or_keys(self, t, seq_dim=-2, offset=0, freq_seq_len=None):
|
126 |
+
assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
|
127 |
+
|
128 |
+
device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
|
129 |
+
|
130 |
+
if exists(freq_seq_len):
|
131 |
+
assert freq_seq_len >= seq_len
|
132 |
+
seq_len = freq_seq_len
|
133 |
+
|
134 |
+
freqs = self.forward(
|
135 |
+
lambda: self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset),
|
136 |
+
cache_key=f'freqs:{seq_len}|offset:{offset}')
|
137 |
+
return apply_rotary_emb(freqs, t)
|
138 |
+
|
139 |
+
def rotate_queries_with_cached_keys(self, q, k, seq_dim=-2):
|
140 |
+
q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
|
141 |
+
assert q_len <= k_len
|
142 |
+
q = self.rotate_queries_or_keys(q, seq_dim=seq_dim, freq_seq_len=k_len)
|
143 |
+
k = self.rotate_queries_or_keys(k, seq_dim=seq_dim)
|
144 |
+
return q, k
|
145 |
+
|
146 |
+
def rotate_queries_and_keys(self, q, k, seq_dim=-2):
|
147 |
+
assert self.use_xpos
|
148 |
+
device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
|
149 |
+
seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)
|
150 |
+
freqs = self.forward(lambda: seq, cache_key=f'freqs:{seq_len}')
|
151 |
+
scale = self.get_scale(lambda: seq, cache_key=f'scale:{seq_len}').to(dtype)
|
152 |
+
rotated_q = apply_rotary_emb(freqs, q, scale=scale)
|
153 |
+
rotated_k = apply_rotary_emb(freqs, k, scale=scale**-1)
|
154 |
+
return rotated_q, rotated_k
|
155 |
+
|
156 |
+
def get_scale(self, t, cache_key=None):
|
157 |
+
assert self.use_xpos
|
158 |
+
|
159 |
+
if exists(cache_key) and cache_key in self.cache:
|
160 |
+
return self.cache[cache_key]
|
161 |
+
|
162 |
+
if callable(t):
|
163 |
+
t = t()
|
164 |
+
|
165 |
+
scale = 1.
|
166 |
+
if self.use_xpos:
|
167 |
+
power = (t - len(t) // 2) / self.scale_base
|
168 |
+
scale = self.scale**rearrange(power, 'n -> n 1')
|
169 |
+
scale = torch.cat((scale, scale), dim=-1)
|
170 |
+
|
171 |
+
if exists(cache_key):
|
172 |
+
self.cache[cache_key] = scale
|
173 |
+
|
174 |
+
return scale
|
175 |
+
|
176 |
+
def forward(self, t, cache_key=None):
|
177 |
+
if exists(cache_key) and cache_key in self.cache:
|
178 |
+
return self.cache[cache_key]
|
179 |
+
|
180 |
+
if callable(t):
|
181 |
+
t = t()
|
182 |
+
|
183 |
+
freqs = self.freqs
|
184 |
+
|
185 |
+
freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
|
186 |
+
freqs = repeat(freqs, '... n -> ... (n r)', r=2)
|
187 |
+
|
188 |
+
if exists(cache_key):
|
189 |
+
self.cache[cache_key] = freqs
|
190 |
+
|
191 |
+
return freqs
|
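# ------------------------------------------------------------------------------------------
# (Sketch, not part of the original file.) Typical usage with attention queries/keys of
# shape (batch, heads, seq_len, dim_head):
#
#   rope = RotaryEmbedding(dim=32)
#   q = rope.rotate_queries_or_keys(q)
#   k = rope.rotate_queries_or_keys(k)
#
# With use_xpos=True, call rotate_queries_and_keys(q, k) instead, so that queries and keys
# receive the reciprocal xpos scaling computed above.
# ------------------------------------------------------------------------------------------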
amt/src/extras/run_spleeter_mir1k.sh
ADDED
@@ -0,0 +1,17 @@
1 |
+
#!/bin/bash
|
2 |
+
shopt -s globstar
|
3 |
+
for file in "$1"/**/*.wav; do
|
4 |
+
echo $file
|
5 |
+
output_dir="tmp"
|
6 |
+
spleeter separate -b 256k -B tensorflow -p spleeter:2stems -o $output_dir $file -f {instrument}.{codec}
|
7 |
+
sox --ignore-length tmp/accompaniment.wav -r 16000 -c 1 -b 16 tmp/accompaniment_16k.wav
|
8 |
+
sox --ignore-length tmp/vocals.wav -r 16000 -c 1 -b 16 tmp/vocals_16k.wav
|
9 |
+
acc_file="${file//.wav/_accompaniment.wav}"
|
10 |
+
voc_file="${file//.wav/_vocals.wav}"
|
11 |
+
mv -f "tmp/accompaniment_16k.wav" $acc_file
|
12 |
+
mv -f "tmp/vocals_16k.wav" $voc_file
|
13 |
+
echo $acc_file
|
14 |
+
echo $voc_file
|
15 |
+
rm -rf tmp
|
16 |
+
done
|
17 |
+
rm -rf pretrained_models
|
amt/src/extras/run_spleeter_mirst500.sh
ADDED
@@ -0,0 +1,13 @@
1 |
+
#!/bin/bash
|
2 |
+
shopt -s globstar
|
3 |
+
for file in "$1"/**/*.wav; do
|
4 |
+
output_dir="${file%/*}"
|
5 |
+
input_file="$output_dir/converted_Mixture.wav"
|
6 |
+
spleeter separate -p spleeter:2stems -o $output_dir $input_file -f {instrument}.{codec}
|
7 |
+
ffmpeg -i "$output_dir/vocals.wav" -acodec pcm_s16le -ac 1 -ar 16000 -y "$output_dir/vocals_16k.wav"
|
8 |
+
ffmpeg -i "$output_dir/accompaniment.wav" -acodec pcm_s16le -ac 1 -ar 16000 -y "$output_dir/accompaniment_16k.wav"
|
9 |
+
rm "$output_dir/vocals.wav"
|
10 |
+
rm "$output_dir/accompaniment.wav"
|
11 |
+
mv "$output_dir/vocals_16k.wav" "$output_dir/vocals.wav"
|
12 |
+
mv "$output_dir/accompaniment_16k.wav" "$output_dir/accompaniment.wav"
|
13 |
+
done
|
amt/src/extras/run_spleeter_mirst500_cmedia.sh
ADDED
@@ -0,0 +1,13 @@
1 |
+
#!/bin/bash
|
2 |
+
shopt -s globstar
|
3 |
+
for file in "$1"/**/*.wav; do
|
4 |
+
output_dir="${file%/*}"
|
5 |
+
input_file="$output_dir/converted_Mixture.wav"
|
6 |
+
spleeter separate -p spleeter:2stems -o $output_dir $input_file -f {instrument}.{codec}
|
7 |
+
ffmpeg -i "$output_dir/vocals.wav" -acodec pcm_s16le -ac 1 -ar 16000 -y "$output_dir/vocals_16k.wav"
|
8 |
+
ffmpeg -i "$output_dir/accompaniment.wav" -acodec pcm_s16le -ac 1 -ar 16000 -y "$output_dir/accompaniment_16k.wav"
|
9 |
+
rm "$output_dir/vocals.wav"
|
10 |
+
rm "$output_dir/accompaniment.wav"
|
11 |
+
mv "$output_dir/vocals_16k.wav" "$output_dir/vocals.wav"
|
12 |
+
mv "$output_dir/accompaniment_16k.wav" "$output_dir/accompaniment.wav"
|
13 |
+
done
|
amt/src/extras/swap_channel.py
ADDED
@@ -0,0 +1,122 @@
1 |
+
import numpy as np
|
2 |
+
|
3 |
+
a = np.arange(12).reshape(2, 3, 2) # (batch, channel, dim)
|
4 |
+
print(a)
|
5 |
+
# -> array([[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]]])
|
6 |
+
|
7 |
+
# swap_mat = create_swap_channel_mat(input_shape, swap_channel=(1, 2))  # illustrative only; this helper is not defined in this file
|
8 |
+
|
9 |
+
# will swap channel 1 and 2 of batch 0 with channel 1 and 2 of batch 1
|
10 |
+
# b = a @ swap_mat
|
11 |
+
# print(b)
|
12 |
+
# expected output
|
13 |
+
# array([[[0, 1], [8, 9], [10, 11]], [[6, 7], [2, 3], [4, 5]]])
|
14 |
+
|
15 |
+
import torch
|
16 |
+
|
17 |
+
|
18 |
+
def swap_channels_between_batches(a_tensor, swap_channels):
|
19 |
+
# Copy the tensor to avoid modifying the original tensor
|
20 |
+
result_tensor = a_tensor.clone()
|
21 |
+
|
22 |
+
# Unpack the channels to be swapped
|
23 |
+
ch1, ch2 = swap_channels
|
24 |
+
|
25 |
+
# Swap the specified channels between batches
|
26 |
+
result_tensor[0, ch1, :], result_tensor[1, ch1, :] = a_tensor[1, ch1, :].clone(), a_tensor[0, ch1, :].clone()
|
27 |
+
result_tensor[0, ch2, :], result_tensor[1, ch2, :] = a_tensor[1, ch2, :].clone(), a_tensor[0, ch2, :].clone()
|
28 |
+
|
29 |
+
return result_tensor
|
30 |
+
|
31 |
+
|
32 |
+
# Define a sample tensor 'a_tensor'
|
33 |
+
a_tensor = torch.tensor([[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]]], dtype=torch.float32)
|
34 |
+
|
35 |
+
# Define channels to swap
|
36 |
+
swap_channels = (1, 2) # Channels to swap between batches
|
37 |
+
|
38 |
+
# Swap the channels between batches
|
39 |
+
swapped_tensor = swap_channels_between_batches(a_tensor, swap_channels)
|
40 |
+
|
41 |
+
# Print the original tensor and the tensor after swapping channels between batches
|
42 |
+
print("Original Tensor 'a_tensor':")
|
43 |
+
print(a_tensor)
|
44 |
+
print("\nTensor after swapping channels between batches:")
|
45 |
+
print(swapped_tensor)
|
46 |
+
|
47 |
+
#-------------------------------------------------
|
48 |
+
|
49 |
+
import torch
|
50 |
+
from einops import rearrange
|
51 |
+
|
52 |
+
|
53 |
+
def shift(arr, num, fill_value=np.nan):
|
54 |
+
result = np.empty_like(arr)
|
55 |
+
if num > 0:
|
56 |
+
result[:num] = fill_value
|
57 |
+
result[num:] = arr[:-num]
|
58 |
+
elif num < 0:
|
59 |
+
result[num:] = fill_value
|
60 |
+
result[:num] = arr[-num:]
|
61 |
+
else:
|
62 |
+
result[:] = arr
|
63 |
+
return result
|
64 |
+
|
65 |
+
|
66 |
+
def create_batch_swap_matrix(batch_size, channels, swap_channels):
|
67 |
+
swap_mat = np.eye(batch_size * channels)
|
68 |
+
|
69 |
+
for c in swap_channels:
|
70 |
+
idx1 = c  # channel index to swap in the first batch
|
71 |
+
idx2 = c + channels  # channel index to swap in the second batch
|
72 |
+
|
73 |
+
swap_mat[idx1, idx1], swap_mat[idx2, idx2] = 0, 0  # zero out the diagonal entries
|
74 |
+
swap_mat[idx1, idx2], swap_mat[idx2, idx1] = 1, 1  # swap the corresponding channels
|
75 |
+
return swap_mat
|
76 |
+
|
77 |
+
|
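# NOTE: the create_batch_swap_matrix defined below redefines (and therefore overrides) the
# per-channel version above; the example code at the bottom of this file uses the second one.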
78 |
+
def create_batch_swap_matrix(batch_size, channels, swap_channels):
|
79 |
+
swap_mat = np.eye(batch_size * channels)
|
80 |
+
|
81 |
+
# perform the swap for every listed channel
|
82 |
+
for c in swap_channels:
|
83 |
+
idx1 = np.arange(c, batch_size * channels, channels)  # indices of this channel in every batch
|
84 |
+
idx2 = (idx1 + channels) % (batch_size * channels)  # wrap around to the first batch via modulo
|
85 |
+
|
86 |
+
swap_mat[idx1, idx1] = 0
|
87 |
+
swap_mat[idx2, idx2] = 0
|
88 |
+
swap_mat[idx1, idx2] = 1
|
89 |
+
swap_mat[idx2, idx1] = 1
|
90 |
+
|
91 |
+
return swap_mat
|
92 |
+
|
93 |
+
|
94 |
+
def swap_channels_between_batches(input_tensor, swap_matrix):
|
95 |
+
reshaped_tensor = rearrange(input_tensor, 'b c d -> (b c) d')
|
96 |
+
swapped_tensor = swap_matrix @ reshaped_tensor
|
97 |
+
return rearrange(swapped_tensor, '(b c) d -> b c d', b=input_tensor.shape[0])
|
98 |
+
|
99 |
+
|
100 |
+
# Example parameters
|
101 |
+
batch_size = 2
|
102 |
+
channels = 3
|
103 |
+
# swap_info = {
|
104 |
+
# : [1, 2] # batch_index: [channel_indices]
|
105 |
+
# }
|
106 |
+
swap_channels = [1, 2]  # channels to swap
|
107 |
+
|
108 |
+
# Create an example tensor
|
109 |
+
input_tensor = torch.tensor([[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]]], dtype=torch.float32)
|
110 |
+
|
111 |
+
# Create the swap matrix
|
112 |
+
swap_matrix = create_batch_swap_matrix(batch_size, channels, swap_channels)
|
113 |
+
swap_matrix = torch.Tensor(swap_matrix)
|
114 |
+
|
115 |
+
# Perform the channel swap
|
116 |
+
swapped_tensor = swap_channels_between_batches(input_tensor, swap_matrix)
|
117 |
+
|
118 |
+
# Print the results
|
119 |
+
print("Original Tensor:")
|
120 |
+
print(input_tensor)
|
121 |
+
print("\nSwapped Tensor:")
|
122 |
+
print(swapped_tensor)
|
amt/src/extras/t5_dev.py
ADDED
@@ -0,0 +1,41 @@
1 |
+
import torch
|
2 |
+
from transformers import T5Config
|
3 |
+
from model.t5mod import T5ForConditionalGeneration
|
4 |
+
|
5 |
+
a = {
|
6 |
+
"architectures": ["T5ForConditionalGeneration"],
|
7 |
+
"d_ff": 1024, # size of the intermediate feed forward layer in each T5Block
|
8 |
+
"d_kv": 64, # d_kv has to be equal to d_model // num_heads.
|
9 |
+
# "d_model": 512, # encoder hiddnen size, defined by model_cfg
|
10 |
+
"decoder_start_token_id": 0,
|
11 |
+
"dense_act_fn": "gelu_new",
|
12 |
+
# "dropout_rate": 0.05, # can be overwritten by args in ymt3
|
13 |
+
"eos_token_id": 1,
|
14 |
+
"feed_forward_proj": "gated-gelu",
|
15 |
+
"initializer_factor": 1.0,
|
16 |
+
# "is_encoder_decoder": True,
|
17 |
+
"is_gated_act": True,
|
18 |
+
"layer_norm_epsilon": 1e-06,
|
19 |
+
"model_type": "t5",
|
20 |
+
# "num_decoder_layers": 8,
|
21 |
+
"num_heads": 6,
|
22 |
+
"num_layers": 8,
|
23 |
+
"output_past": True,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"relative_attention_num_buckets": 32,
|
26 |
+
"use_cache": True,
|
27 |
+
"vocab_size": 1391 # vocab_size is automatically set by the task manager...
|
28 |
+
}
|
29 |
+
cfg = T5Config(**a)
|
30 |
+
cfg.num_decoder_layers = 4
|
31 |
+
cfg.num_layers = 0
|
32 |
+
|
33 |
+
model = T5ForConditionalGeneration(cfg)
|
34 |
+
print(model)
|
35 |
+
|
36 |
+
x = torch.rand(((2, 256, 512)))
|
37 |
+
out = model.encoder.forward(inputs_embeds=x)
|
38 |
+
|
39 |
+
enc_hs = torch.rand((2, 256, 512))
|
40 |
+
labels = torch.randint(0, 1391, (2, 256))
|
41 |
+
pred = model(encoder_outputs=(enc_hs,), labels=labels)  # note: encoder_outputs must be a tuple, hence the trailing comma in (enc_hs,)
|
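# (Note) Since `labels` are passed, `pred` is a Seq2SeqLMOutput: `pred.loss` holds the
# cross-entropy loss and `pred.logits` has shape (batch, target_len, vocab_size),
# assuming model.t5mod keeps the Hugging Face T5 return signature.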
amt/src/extras/t5perceiver.py
ADDED
@@ -0,0 +1,443 @@
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
""" Bare wrapper of HF PyTorch T5 and Perceiver with the following modifications:
|
11 |
+
- PerceiverTF encoder
|
12 |
+
- ResConv pre-encoder
|
13 |
+
- Projection layers for dynamic dimension matching
|
14 |
+
- Sinusoidal absolute positional embeddings
|
15 |
+
- Positional embeddings from Perceiver implementation
|
16 |
+
- Task conditioning on encoder and decoder by input tokens
|
17 |
+
"""
|
18 |
+
import copy
|
19 |
+
import warnings
|
20 |
+
from typing import Optional, Tuple, Union
|
21 |
+
|
22 |
+
import torch
|
23 |
+
from torch import nn
|
24 |
+
from torch.nn import CrossEntropyLoss
|
25 |
+
from torch.utils.checkpoint import checkpoint
|
26 |
+
|
27 |
+
from transformers.utils import logging
|
28 |
+
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
|
29 |
+
from transformers.modeling_utils import PreTrainedModel
|
30 |
+
from transformers.models.t5.modeling_t5 import (T5LayerNorm, T5Block, PARALLELIZE_DOCSTRING, DEPARALLELIZE_DOCSTRING,
|
31 |
+
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING, _CONFIG_FOR_DOC,
|
32 |
+
__HEAD_MASK_WARNING_MSG)
|
33 |
+
from transformers.modeling_outputs import (Seq2SeqLMOutput, BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions)
|
34 |
+
from transformers import T5Config #, T5PreTrainedModel
|
35 |
+
from model.ops import FixedSinusoidalPositionalEmbedding
|
36 |
+
|
37 |
+
# additional imports
|
38 |
+
from model.t5mod import T5Stack
|
39 |
+
from transformers.models.t5.modeling_t5 import (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5DenseActDense,
|
40 |
+
T5DenseGatedActDense, T5Attention, load_tf_weights_in_t5,
|
41 |
+
is_torch_fx_proxy)
|
42 |
+
|
43 |
+
from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK)
|
44 |
+
|
45 |
+
logger = logging.get_logger(__name__)
|
46 |
+
|
47 |
+
|
48 |
+
class T5PerceiverPreTrainedModel(PreTrainedModel):
|
49 |
+
"""
|
50 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
51 |
+
models.
|
52 |
+
"""
|
53 |
+
|
54 |
+
config_class = None
|
55 |
+
load_tf_weights = load_tf_weights_in_t5
|
56 |
+
base_model_prefix = "transformer"
|
57 |
+
is_parallelizable = True
|
58 |
+
supports_gradient_checkpointing = True
|
59 |
+
_no_split_modules = ["T5Block"]
|
60 |
+
_keep_in_fp32_modules = ["wo"]
|
61 |
+
|
62 |
+
@property
|
63 |
+
def dummy_inputs(self):
|
64 |
+
input_ids = torch.tensor(DUMMY_INPUTS)
|
65 |
+
input_mask = torch.tensor(DUMMY_MASK)
|
66 |
+
dummy_inputs = {
|
67 |
+
"decoder_input_ids": input_ids,
|
68 |
+
"input_ids": input_ids,
|
69 |
+
"decoder_attention_mask": input_mask,
|
70 |
+
}
|
71 |
+
return dummy_inputs
|
72 |
+
|
73 |
+
def _init_weights(self, module):
|
74 |
+
"""Initialize the weights"""
|
75 |
+
factor = self.config.initializer_factor # Used for testing weights initialization
|
76 |
+
if isinstance(module, T5LayerNorm):
|
77 |
+
module.weight.data.fill_(factor * 1.0)
|
78 |
+
elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)):
|
79 |
+
# Mesh TensorFlow embeddings initialization
|
80 |
+
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
|
81 |
+
module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
|
82 |
+
if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
|
83 |
+
module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
|
84 |
+
elif isinstance(module, T5DenseActDense):
|
85 |
+
# Mesh TensorFlow FF initialization
|
86 |
+
# See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
|
87 |
+
# and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
|
88 |
+
module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
|
89 |
+
if hasattr(module.wi, "bias") and module.wi.bias is not None:
|
90 |
+
module.wi.bias.data.zero_()
|
91 |
+
module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
|
92 |
+
if hasattr(module.wo, "bias") and module.wo.bias is not None:
|
93 |
+
module.wo.bias.data.zero_()
|
94 |
+
elif isinstance(module, T5DenseGatedActDense):
|
95 |
+
module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
|
96 |
+
if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
|
97 |
+
module.wi_0.bias.data.zero_()
|
98 |
+
module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model)**-0.5))
|
99 |
+
if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
|
100 |
+
module.wi_1.bias.data.zero_()
|
101 |
+
module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff)**-0.5))
|
102 |
+
if hasattr(module.wo, "bias") and module.wo.bias is not None:
|
103 |
+
module.wo.bias.data.zero_()
|
104 |
+
elif isinstance(module, T5Attention):
|
105 |
+
# Mesh TensorFlow attention initialization to avoid scaling before softmax
|
106 |
+
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
|
107 |
+
d_model = self.config.d_model
|
108 |
+
key_value_proj_dim = self.config.d_kv
|
109 |
+
n_heads = self.config.num_heads
|
110 |
+
module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5))
|
111 |
+
module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
|
112 |
+
module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
|
113 |
+
module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5))
|
114 |
+
if module.has_relative_attention_bias:
|
115 |
+
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model)**-0.5))
|
116 |
+
|
117 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
118 |
+
if isinstance(module, (T5Attention, T5Stack)):
|
119 |
+
module.gradient_checkpointing = value
|
120 |
+
|
121 |
+
def _shift_right(self, input_ids):
|
122 |
+
decoder_start_token_id = self.config.decoder_start_token_id
|
123 |
+
pad_token_id = self.config.pad_token_id
|
124 |
+
|
125 |
+
assert decoder_start_token_id is not None, (
|
126 |
+
"self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
|
127 |
+
" See T5 docs for more information")
|
128 |
+
|
129 |
+
# shift inputs to the right
|
130 |
+
if is_torch_fx_proxy(input_ids):
|
131 |
+
# Item assignment is not supported natively for proxies.
|
132 |
+
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
|
133 |
+
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
|
134 |
+
else:
|
135 |
+
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
|
136 |
+
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
|
137 |
+
shifted_input_ids[..., 0] = decoder_start_token_id
|
138 |
+
|
139 |
+
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
|
140 |
+
# replace possible -100 values in labels by `pad_token_id`
|
141 |
+
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
|
142 |
+
|
143 |
+
return shifted_input_ids
|
144 |
+
|
145 |
+
|
146 |
+
class T5PerceiverForConditionalGeneration(T5PerceiverPreTrainedModel):
|
147 |
+
config_class = None
|
148 |
+
load_tf_weights = load_tf_weights_in_t5
|
149 |
+
base_model_prefix = "transformer"
|
150 |
+
is_parallelizable = True
|
151 |
+
supports_gradient_checkpointing = True
|
152 |
+
_no_split_modules = ["T5Block"]
|
153 |
+
_keep_in_fp32_modules = ["wo"]
|
154 |
+
|
155 |
+
@property
|
156 |
+
def dummy_inputs(self):
|
157 |
+
input_ids = torch.tensor(DUMMY_INPUTS)
|
158 |
+
input_mask = torch.tensor(DUMMY_MASK)
|
159 |
+
dummy_inputs = {
|
160 |
+
"decoder_input_ids": input_ids,
|
161 |
+
"input_ids": input_ids,
|
162 |
+
"decoder_attention_mask": input_mask,
|
163 |
+
}
|
164 |
+
return dummy_inputs
|
165 |
+
|
166 |
+
def __init__(
|
167 |
+
self,
|
168 |
+
model_cfg: dict,
|
169 |
+
config: T5Config,
|
170 |
+
use_fixed_absolute_pe: bool = True,
|
171 |
+
num_max_positions: int = 1025
|
172 |
+
):
|
173 |
+
super().__init__(config)
|
174 |
+
self.model_dim = config.d_model
|
175 |
+
""" mod: absolute position embedding """
|
176 |
+
self.use_fixed_absolute_pe = use_fixed_absolute_pe
|
177 |
+
|
178 |
+
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
179 |
+
|
180 |
+
encoder_config = copy.deepcopy(config)
|
181 |
+
encoder_config.is_decoder = False
|
182 |
+
encoder_config.use_cache = False
|
183 |
+
encoder_config.is_encoder_decoder = False
|
184 |
+
self.encoder = T5Stack(encoder_config,
|
185 |
+
self.shared,
|
186 |
+
use_fixed_absolute_pe=use_fixed_absolute_pe,
|
187 |
+
num_max_positions=num_max_positions)
|
188 |
+
|
189 |
+
decoder_config = copy.deepcopy(config)
|
190 |
+
decoder_config.is_decoder = True
|
191 |
+
decoder_config.is_encoder_decoder = False
|
192 |
+
decoder_config.num_layers = config.num_decoder_layers
|
193 |
+
self.decoder = T5Stack(decoder_config,
|
194 |
+
self.shared,
|
195 |
+
use_fixed_absolute_pe=use_fixed_absolute_pe,
|
196 |
+
num_max_positions=num_max_positions)
|
197 |
+
|
198 |
+
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
|
199 |
+
|
200 |
+
# Initialize weights and apply final processing
|
201 |
+
self.post_init()
|
202 |
+
|
203 |
+
# Model parallel
|
204 |
+
self.model_parallel = False
|
205 |
+
self.device_map = None
|
206 |
+
|
207 |
+
def get_input_embeddings(self):
|
208 |
+
return self.shared
|
209 |
+
|
210 |
+
def set_input_embeddings(self, new_embeddings):
|
211 |
+
self.shared = new_embeddings
|
212 |
+
self.encoder.set_input_embeddings(new_embeddings)
|
213 |
+
self.decoder.set_input_embeddings(new_embeddings)
|
214 |
+
|
215 |
+
def set_output_embeddings(self, new_embeddings):
|
216 |
+
self.lm_head = new_embeddings
|
217 |
+
|
218 |
+
def get_output_embeddings(self):
|
219 |
+
return self.lm_head
|
220 |
+
|
221 |
+
def get_encoder(self):
|
222 |
+
return self.encoder
|
223 |
+
|
224 |
+
def get_decoder(self):
|
225 |
+
return self.decoder
|
226 |
+
|
227 |
+
def forward(
|
228 |
+
self,
|
229 |
+
input_ids: Optional[torch.LongTensor] = None,
|
230 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
231 |
+
decoder_input_ids: Optional[torch.LongTensor] = None,
|
232 |
+
decoder_attention_mask: Optional[torch.BoolTensor] = None,
|
233 |
+
head_mask: Optional[torch.FloatTensor] = None,
|
234 |
+
decoder_head_mask: Optional[torch.FloatTensor] = None,
|
235 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
236 |
+
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
|
237 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
|
238 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
239 |
+
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
240 |
+
labels: Optional[torch.LongTensor] = None,
|
241 |
+
use_cache: Optional[bool] = None,
|
242 |
+
output_attentions: Optional[bool] = None,
|
243 |
+
output_hidden_states: Optional[bool] = None,
|
244 |
+
return_dict: Optional[bool] = None,
|
245 |
+
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
|
246 |
+
r"""
|
247 |
+
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
248 |
+
Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
|
249 |
+
config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
|
250 |
+
labels in `[0, ..., config.vocab_size]`
|
251 |
+
|
252 |
+
Returns:
|
253 |
+
|
254 |
+
Examples:
|
255 |
+
|
256 |
+
```python
|
257 |
+
>>> from transformers import AutoTokenizer, T5ForConditionalGeneration
|
258 |
+
|
259 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
260 |
+
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
261 |
+
|
262 |
+
>>> # training
|
263 |
+
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
|
264 |
+
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
|
265 |
+
>>> outputs = model(input_ids=input_ids, labels=labels)
|
266 |
+
>>> loss = outputs.loss
|
267 |
+
>>> logits = outputs.logits
|
268 |
+
|
269 |
+
>>> # inference
|
270 |
+
>>> input_ids = tokenizer(
|
271 |
+
... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
|
272 |
+
... ).input_ids # Batch size 1
|
273 |
+
>>> outputs = model.generate(input_ids)
|
274 |
+
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
275 |
+
>>> # studies have shown that owning a dog is good for you.
|
276 |
+
```"""
|
277 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
278 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
279 |
+
|
280 |
+
# FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
|
281 |
+
if head_mask is not None and decoder_head_mask is None:
|
282 |
+
if self.config.num_layers == self.config.num_decoder_layers:
|
283 |
+
warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
|
284 |
+
decoder_head_mask = head_mask
|
285 |
+
|
286 |
+
# Encode if needed (training, first prediction pass)
|
287 |
+
if encoder_outputs is None:
|
288 |
+
# Convert encoder inputs in embeddings if needed
|
289 |
+
encoder_outputs = self.encoder(
|
290 |
+
input_ids=input_ids,
|
291 |
+
attention_mask=attention_mask,
|
292 |
+
inputs_embeds=inputs_embeds,
|
293 |
+
head_mask=head_mask,
|
294 |
+
output_attentions=output_attentions,
|
295 |
+
output_hidden_states=output_hidden_states,
|
296 |
+
return_dict=return_dict,
|
297 |
+
)
|
298 |
+
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
299 |
+
encoder_outputs = BaseModelOutput(
|
300 |
+
last_hidden_state=encoder_outputs[0],
|
301 |
+
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
302 |
+
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
|
303 |
+
)
|
304 |
+
|
305 |
+
hidden_states = encoder_outputs[0]
|
306 |
+
|
307 |
+
if self.model_parallel:
|
308 |
+
torch.cuda.set_device(self.decoder.first_device)
|
309 |
+
|
310 |
+
if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
|
311 |
+
# get decoder inputs from shifting lm labels to the right
|
312 |
+
decoder_input_ids = self._shift_right(labels)
|
313 |
+
|
314 |
+
# Set device for model parallelism
|
315 |
+
if self.model_parallel:
|
316 |
+
torch.cuda.set_device(self.decoder.first_device)
|
317 |
+
hidden_states = hidden_states.to(self.decoder.first_device)
|
318 |
+
if decoder_input_ids is not None:
|
319 |
+
decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
|
320 |
+
if attention_mask is not None:
|
321 |
+
attention_mask = attention_mask.to(self.decoder.first_device)
|
322 |
+
if decoder_attention_mask is not None:
|
323 |
+
decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
|
324 |
+
|
325 |
+
# Decode
|
326 |
+
decoder_outputs = self.decoder(
|
327 |
+
input_ids=decoder_input_ids,
|
328 |
+
attention_mask=decoder_attention_mask,
|
329 |
+
inputs_embeds=decoder_inputs_embeds,
|
330 |
+
past_key_values=past_key_values,
|
331 |
+
encoder_hidden_states=hidden_states,
|
332 |
+
encoder_attention_mask=attention_mask,
|
333 |
+
head_mask=decoder_head_mask,
|
334 |
+
cross_attn_head_mask=cross_attn_head_mask,
|
335 |
+
use_cache=use_cache,
|
336 |
+
output_attentions=output_attentions,
|
337 |
+
output_hidden_states=output_hidden_states,
|
338 |
+
return_dict=return_dict,
|
339 |
+
)
|
340 |
+
|
341 |
+
sequence_output = decoder_outputs[0]
|
342 |
+
|
343 |
+
# Set device for model parallelism
|
344 |
+
if self.model_parallel:
|
345 |
+
torch.cuda.set_device(self.encoder.first_device)
|
346 |
+
self.lm_head = self.lm_head.to(self.encoder.first_device)
|
347 |
+
sequence_output = sequence_output.to(self.lm_head.weight.device)
|
348 |
+
|
349 |
+
if self.config.tie_word_embeddings:
|
350 |
+
# Rescale output before projecting on vocab
|
351 |
+
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
|
352 |
+
sequence_output = sequence_output * (self.model_dim**-0.5)
|
353 |
+
|
354 |
+
lm_logits = self.lm_head(sequence_output)
|
355 |
+
|
356 |
+
loss = None
|
357 |
+
if labels is not None:
|
358 |
+
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
359 |
+
# move labels to correct device to enable PP
|
360 |
+
labels = labels.to(lm_logits.device)
|
361 |
+
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
|
362 |
+
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
|
363 |
+
|
364 |
+
if not return_dict:
|
365 |
+
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
|
366 |
+
return ((loss,) + output) if loss is not None else output
|
367 |
+
|
368 |
+
return Seq2SeqLMOutput(
|
369 |
+
loss=loss,
|
370 |
+
logits=lm_logits,
|
371 |
+
past_key_values=decoder_outputs.past_key_values,
|
372 |
+
decoder_hidden_states=decoder_outputs.hidden_states,
|
373 |
+
decoder_attentions=decoder_outputs.attentions,
|
374 |
+
cross_attentions=decoder_outputs.cross_attentions,
|
375 |
+
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
376 |
+
encoder_hidden_states=encoder_outputs.hidden_states,
|
377 |
+
encoder_attentions=encoder_outputs.attentions,
|
378 |
+
)
|
379 |
+
|
380 |
+
def prepare_inputs_for_generation(
|
381 |
+
self,
|
382 |
+
input_ids,
|
383 |
+
past_key_values=None,
|
384 |
+
attention_mask=None,
|
385 |
+
head_mask=None,
|
386 |
+
decoder_head_mask=None,
|
387 |
+
cross_attn_head_mask=None,
|
388 |
+
use_cache=None,
|
389 |
+
encoder_outputs=None,
|
390 |
+
**kwargs,
|
391 |
+
):
|
392 |
+
# cut decoder_input_ids if past is used
|
393 |
+
if past_key_values is not None:
|
394 |
+
input_ids = input_ids[:, -1:]
|
395 |
+
|
396 |
+
return {
|
397 |
+
"decoder_input_ids": input_ids,
|
398 |
+
"past_key_values": past_key_values,
|
399 |
+
"encoder_outputs": encoder_outputs,
|
400 |
+
"attention_mask": attention_mask,
|
401 |
+
"head_mask": head_mask,
|
402 |
+
"decoder_head_mask": decoder_head_mask,
|
403 |
+
"cross_attn_head_mask": cross_attn_head_mask,
|
404 |
+
"use_cache": use_cache,
|
405 |
+
}
|
406 |
+
|
407 |
+
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
|
408 |
+
return self._shift_right(labels)
|
409 |
+
|
410 |
+
def _reorder_cache(self, past_key_values, beam_idx):
|
411 |
+
# if decoder past is not included in output
|
412 |
+
# speedy decoding is disabled and no need to reorder
|
413 |
+
if past_key_values is None:
|
414 |
+
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
|
415 |
+
return past_key_values
|
416 |
+
|
417 |
+
reordered_decoder_past = ()
|
418 |
+
for layer_past_states in past_key_values:
|
419 |
+
# get the correct batch idx from layer past batch dim
|
420 |
+
# batch dim of `past` is at 2nd position
|
421 |
+
reordered_layer_past_states = ()
|
422 |
+
for layer_past_state in layer_past_states:
|
423 |
+
# need to set correct `past` for each of the four key / value states
|
424 |
+
reordered_layer_past_states = reordered_layer_past_states + (layer_past_state.index_select(
|
425 |
+
0, beam_idx.to(layer_past_state.device)),)
|
426 |
+
|
427 |
+
assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
|
428 |
+
assert len(reordered_layer_past_states) == len(layer_past_states)
|
429 |
+
|
430 |
+
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
|
431 |
+
return reordered_decoder_past
|
432 |
+
|
433 |
+
|
434 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
435 |
+
from transformers import AutoModel, AutoConfig, PerceiverConfig
|
436 |
+
|
437 |
+
|
438 |
+
class MyConfig(T5Config, PerceiverConfig):
|
439 |
+
model_type = 'mymodel'
|
440 |
+
|
441 |
+
def __init__(self, important_param=42, **kwargs):
|
442 |
+
super().__init__(**kwargs)
|
443 |
+
self.important_param = important_param
|
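# ------------------------------------------------------------------------------------------
# (Sketch, not part of the original file.) If MyConfig is meant to plug into the Auto*
# factories, which the otherwise unused AutoConfig/AutoModel imports above suggest, the usual
# registration pattern would be:
#
#   AutoConfig.register("mymodel", MyConfig)
#   AutoModel.register(MyConfig, T5PerceiverForConditionalGeneration)
# ------------------------------------------------------------------------------------------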
amt/src/extras/unimax_sampler/README.md
ADDED
@@ -0,0 +1,45 @@
1 |
+
# UniMax Language Dataset Sampler with DDP support
|
2 |
+
|
3 |
+
This repository contains an unofficial implementation of the UNIMAX sampling algorithm using PyTorch. The UNIMAX algorithm ["UniMax: Fairer and more Effective Language Sampling for Large-Scale Multilingual Pretraining" by HW Chung et al. (ICLR 2023)](https://arxiv.org/abs/2304.09151) is used to generate a sampling distribution of languages based on their character counts, a total character budget, and a specified number of epochs per language. This can be useful for training language models on datasets with imbalanced language distribution.
|
4 |
+
|
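The gist of the algorithm, as a rough sketch (the actual `UnimaxSampler` may differ in details): languages are visited from smallest to largest, each language is granted at most `num_epochs` passes over its data, and any budget it cannot use is redistributed uniformly over the remaining, larger languages.

```python
import numpy as np

def unimax_distribution(language_character_counts, total_character_budget, num_epochs):
    counts = np.asarray(language_character_counts, dtype=np.float64)
    budgets = np.zeros_like(counts)
    remaining_budget = float(total_character_budget)
    order = np.argsort(counts)  # visit the smallest language first
    for i, lang in enumerate(order):
        uniform_share = remaining_budget / (len(counts) - i)
        budgets[lang] = min(num_epochs * counts[lang], uniform_share)
        remaining_budget -= budgets[lang]
    return budgets / budgets.sum()  # per-language sampling probabilities

print(unimax_distribution([100, 200, 300, 400, 500], 1000, 2))
```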
5 |
+
## Contents
|
6 |
+
|
7 |
+
1. `unimax_sampler.py`: This Python file contains the `UnimaxSampler` class, a PyTorch `Sampler` that uses the UNIMAX algorithm.
|
8 |
+
|
9 |
+
2. `test_unimax_sampler.py`: This Python file contains a unit test for the `UnimaxSampler` class to ensure its correct functionality.
|
10 |
+
|
11 |
+
## Usage
|
12 |
+
|
13 |
+
```python
|
14 |
+
from torch.utils.data import Dataset, DataLoader
|
15 |
+
from unimax_sampler import UnimaxSampler
|
16 |
+
|
17 |
+
# Define your parameters
|
18 |
+
language_character_counts = [100, 200, 300, 400, 500]
|
19 |
+
total_character_budget = 1000
|
20 |
+
num_epochs = 2
|
21 |
+
|
22 |
+
# Create the UnimaxSampler
|
23 |
+
unimax_sampler = UnimaxSampler(language_character_counts, total_character_budget, num_epochs)
|
24 |
+
```
|
25 |
+
|
26 |
+
Then, use the sampler as the sampler argument when creating a DataLoader.
|
27 |
+
|
28 |
+
```python
|
29 |
+
# Disable shuffle when using custom sampler...
|
30 |
+
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=None, sampler=unimax_sampler)
|
31 |
+
```
|
32 |
+
|
33 |
+
For DDP,
|
34 |
+
```python
|
35 |
+
if torch.distributed.is_initialized():
|
36 |
+
sampler = DistributedUnimaxSampler(...)
|
37 |
+
else:
|
38 |
+
return unimax_sampler(...)
|
39 |
+
```
|
40 |
+
|
41 |
+
## Note
|
42 |
+
The initial version of this code was created by [Chat GPT-4](https://chat.openai.com/), based on the pseudocode provided in the [UNIMAX](https://arxiv.org/abs/2304.09151) paper. Subsequently, the code was manually revised for `PyTorch` Distributed Data Parallel ([DDP](https://pytorch.org/docs/stable/notes/ddp.html)) framework. The DistributedSamplerWrapper implementation is derived from an earlier version found in the [Catalyst](https://github.com/catalyst-team/catalyst) project.
|
43 |
+
|
44 |
+
## License
|
45 |
+
This project is licensed under the MIT License.
|
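A minimal sketch of how the DDP variant is typically driven per epoch, assuming `torch.distributed` has already been initialized; `my_dataset` and the epoch count are placeholders, not part of the diff:

```python
from torch.utils.data import DataLoader
from unimax_sampler import DistributedUnimaxSampler

sampler = DistributedUnimaxSampler([100, 200, 300, 400, 500],
                                   total_character_budget=1000, num_epochs=2)
loader = DataLoader(my_dataset, batch_size=2, shuffle=False, sampler=sampler)

for epoch in range(10):
    # re-seed the wrapped DistributedSampler so each rank sees a new shard order
    sampler.set_epoch(epoch)
    for batch in loader:
        ...
```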
amt/src/extras/unimax_sampler/demo.py
ADDED
@@ -0,0 +1,15 @@
+import torch
+
+from utils.unimax_sampler.unimax_sampler import UnimaxSampler
+
+language_character_counts = [100, 200, 300, 400, 500]
+total_character_budget = 1000
+num_epochs = 2
+
+# Create the UnimaxSampler.
+sampler = UnimaxSampler(language_character_counts, total_character_budget, num_epochs)
+
+# Define the expected output. This will depend on your specific implementation of Unimax.
+expected_output = torch.tensor([0.1, 0.2, 0.3, 0.2, 0.2])
+
+# Use PyTorch's allclose function to compare the computed and expected outputs.
+# The absolute tolerance parameter atol specifies the maximum difference allowed for the test to pass.
+assert torch.allclose(sampler.p, expected_output, atol=1e-6)

amt/src/extras/unimax_sampler/unimax_sampler.py
ADDED
@@ -0,0 +1,168 @@
+import torch
+from torch.utils.data import DistributedSampler
+from torch.utils.data import Dataset, Sampler
+from torch.utils.data import RandomSampler
+from operator import itemgetter
+from typing import List, Union, Iterator, Optional
+
+
+class DatasetFromSampler(Dataset):
+    """Dataset to create indexes from `Sampler`. From catalyst library.
+
+    Args:
+        sampler: PyTorch sampler
+    """
+
+    def __init__(self, sampler: Sampler):
+        """Initialisation for DatasetFromSampler."""
+        self.sampler = sampler
+        self.sampler_list = None
+
+    def __getitem__(self, index: int):
+        """Gets element of the dataset.
+
+        Args:
+            index: index of the element in the dataset
+
+        Returns:
+            Single element by index
+        """
+        if self.sampler_list is None:
+            self.sampler_list = list(self.sampler)
+        return self.sampler_list[index]
+
+    def __len__(self) -> int:
+        """
+        Returns:
+            int: length of the dataset
+        """
+        return len(self.sampler)
+
+
+class DistributedSamplerWrapper(DistributedSampler):
+    """
+    Wrapper over `Sampler` for distributed training.
+    Allows you to use any sampler in distributed mode.
+    From https://github.com/catalyst-team/catalyst/blob/master/catalyst/data/sampler.py
+
+    It is especially useful in conjunction with
+    `torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSamplerWrapper instance as a DataLoader
+    sampler, and load a subset of subsampled data of the original dataset
+    that is exclusive to it.
+
+    .. note::
+        Sampler is assumed to be of constant size.
+    """
+
+    def __init__(
+        self,
+        sampler,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+    ):
+        """
+
+        Args:
+            sampler: Sampler used for subsampling
+            num_replicas (int, optional): Number of processes participating in
+                distributed training
+            rank (int, optional): Rank of the current process
+                within ``num_replicas``
+            shuffle (bool, optional): If true (default),
+                sampler will shuffle the indices
+        """
+        super(DistributedSamplerWrapper, self).__init__(
+            DatasetFromSampler(sampler),
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+        )
+        self.sampler = sampler
+
+    def __iter__(self) -> Iterator[int]:
+        """Iterate over sampler.
+
+        Returns:
+            python iterator
+        """
+        self.dataset = DatasetFromSampler(self.sampler)
+        indexes_of_indexes = super().__iter__()
+        subsampler_indexes = self.dataset
+        return iter(itemgetter(*indexes_of_indexes)(subsampler_indexes))
+
+
+class UnimaxSampler(Sampler):
+    # Initialize the sampler with the character counts for each language,
+    # the total character budget, and the number of epochs per language.
+    def __init__(self, language_character_counts: List[int], total_character_budget: int,
+                 num_epochs: int) -> None:
+        self.language_character_counts = torch.tensor(language_character_counts)
+        self.total_character_budget = total_character_budget
+        self.num_epochs = num_epochs
+        # Compute the sampling distribution p.
+        self.p = self._unimax()
+
+    # Define how to iterate over the data. We'll use PyTorch's multinomial
+    # function to generate indices according to the distribution p.
+    def __iter__(self) -> iter:
+        return iter(torch.multinomial(self.p, len(self.p), replacement=True).tolist())
+
+    # Define the length of the sampler as the number of languages.
+    def __len__(self) -> int:
+        return len(self.p)
+
+    # Implement the UNIMAX algorithm to compute the sampling distribution p.
+    def _unimax(self) -> torch.Tensor:
+        # Sort languages by character count.
+        L, indices = torch.sort(self.language_character_counts)
+        # Initialize the remaining budget to the total character budget.
+        B = float(self.total_character_budget)
+        i = 0
+        # Initialize the budget per language.
+        U = torch.zeros_like(L)
+        # For each language, in ascending order of character count...
+        for idx in indices:
+            # Compute the remaining budget per-language.
+            bl = B / (len(L) - i)
+            # Character count of the language at original position `idx`.
+            cl = self.language_character_counts[idx]
+            # If per-language budget exceeds N epochs of the language, use N epochs.
+            if bl > cl * self.num_epochs:
+                Ul = cl * self.num_epochs
+            # Otherwise use uniform per-language budget.
+            else:
+                Ul = bl
+            # Store the computed budget.
+            U[idx] = Ul
+            # Update the remaining budget.
+            B -= Ul
+            # Move to the next language.
+            i += 1
+        # Normalize the budget to create a distribution.
+        p = U / U.sum()
+        # Return the computed distribution.
+        return p
+
+
+class DistributedUnimaxSampler(UnimaxSampler):
+
+    def __init__(self,
+                 language_character_counts: List[int],
+                 total_character_budget: int,
+                 num_epochs: int,
+                 num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None,
+                 shuffle: bool = True) -> None:
+
+        super().__init__(language_character_counts, total_character_budget, num_epochs)
+        self.distributed_sampler = DistributedSamplerWrapper(self, num_replicas, rank, shuffle)
+
+    def __iter__(self):
+        return iter(self.distributed_sampler)
+
+    def __len__(self):
+        return len(self.distributed_sampler)
+
+    def set_epoch(self, epoch):
+        self.distributed_sampler.set_epoch(epoch)

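As a quick sanity check of `_unimax` (an illustrative calculation by the editor, not part of the diff, import path as in the README): with the README's example inputs, each round allocates 1000 / 5 = 200 characters, which never exceeds `num_epochs * count` for any language, so the resulting distribution is uniform.

```python
from unimax_sampler import UnimaxSampler

# counts [100..500], budget 1000, 2 epochs: U = [200] * 5, so p = U / U.sum()
s = UnimaxSampler([100, 200, 300, 400, 500], total_character_budget=1000, num_epochs=2)
print(s.p)  # expected: tensor([0.2, 0.2, 0.2, 0.2, 0.2])
```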
amt/src/install_dataset.py
ADDED
@@ -0,0 +1,285 @@
+# Copyright 2024 The YourMT3 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Please see the details in the LICENSE file.
+""" install_dataset.py """
+import os
+import argparse
+import mirdata
+from typing import Optional, Tuple, Union
+from utils.preprocess.generate_dataset_stats import generate_dataset_stats_for_all_datasets, update_dataset_stats_for_new_dataset
+from utils.mirdata_dev.datasets import slakh16k
+from utils.preprocess.preprocess_slakh import preprocess_slakh16k, add_program_and_is_drum_info_to_file_list
+from utils.preprocess.preprocess_musicnet import preprocess_musicnet16k
+from utils.preprocess.preprocess_maps import preprocess_maps16k
+from utils.preprocess.preprocess_maestro import preprocess_maestro16k
+from utils.preprocess.preprocess_guitarset import preprocess_guitarset16k, create_filelist_by_style_guitarset16k
+from utils.preprocess.preprocess_enstdrums import preprocess_enstdrums16k, create_filelist_dtm_random_enstdrums16k
+from utils.preprocess.preprocess_mir_st500 import preprocess_mir_st500_16k
+from utils.preprocess.preprocess_cmedia import preprocess_cmedia_16k
+from utils.preprocess.preprocess_rwc_pop_full import preprocess_rwc_pop_full16k
+from utils.preprocess.preprocess_rwc_pop import preprocess_rwc_pop16k
+from utils.preprocess.preprocess_egmd import preprocess_egmd16k
+from utils.preprocess.preprocess_mir1k import preprocess_mir1k_16k
+from utils.preprocess.preprocess_urmp import preprocess_urmp16k
+from utils.preprocess.preprocess_idmt_smt_bass import preprocess_idmt_smt_bass_16k
+from utils.preprocess.preprocess_geerdes import preprocess_geerdes16k
+from utils.utils import download_and_extract  #, download_and_extract_zenodo_restricted
+
+# zenodo_token = "eyJhbGciOiJIUzUxMiIsImlhdCI6MTcxMDE1MDYzNywiZXhwIjoxNzEyNzA3MTk5fQ.eyJpZCI6ImRmODA5NzZlLTBjM2QtNDk5NS05YjM0LWFiNGM4NzJhMmZhMSIsImRhdGEiOnt9LCJyYW5kb20iOiIwMzY5ZDcxZjc2NTMyN2UyYmVmN2ExYjJkMmMyYTRhNSJ9.0aHnNC-7ivWQO6l8twjLR0NDH4boC0uOolAAmogVt7XRi2PHU5MEKBQoK7-wgDdnmWEIqEIvoLO6p8KTnsY9dg"
+
+
+def install_slakh(data_home: os.PathLike, no_down=False) -> None:
+    if not no_down:
+        ds = slakh16k.Dataset(data_home, version='2100-yourmt3-16k')
+        ds.download(partial_download=['2100-yourmt3-16k', 'index'])
+        del ds
+    preprocess_slakh16k(data_home, delete_source_files=False, fix_bass_octave=True)
+    add_program_and_is_drum_info_to_file_list(data_home)
+
+
+def install_musicnet(data_home: os.PathLike, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7811639/files/musicnet_yourmt3_16k.tar.gz?download=1"
+        checksum = "a2da7c169e26d452a4e8b9bef498b3d7"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_musicnet16k(data_home, dataset_name='musicnet')
+
+
+def install_maps(data_home: os.PathLike, no_down=False, sanity_check=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7812075/files/maps_yourmt3_16k.tar.gz?download=1"
+        checksum = "6b070d162c931cd5e69c16ef2398a649"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_maps16k(data_home, dataset_name='maps', ignore_pedal=False, sanity_check=sanity_check)
+
+
+def install_maestro(data_home: os.PathLike, no_down=False, sanity_check=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7852176/files/maestro_yourmt3_16k.tar.gz?download=1"
+        checksum = "c17c6a188d936e5ff3870ef27144d397"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_maestro16k(data_home, dataset_name='maestro', ignore_pedal=False, sanity_check=sanity_check)
+
+
+def install_guitarset(data_home: os.PathLike, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7831843/files/guitarset_yourmt3_16k.tar.gz?download=1"
+        checksum = "e3cfe0cc9394d91d9c290ce888821360"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_guitarset16k(data_home, dataset_name='guitarset')
+    create_filelist_by_style_guitarset16k(data_home, dataset_name='guitarset')
+
+
+def install_enstdrums(data_home, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7831843/files/enstdrums_yourmt3_16k.tar.gz?download=1"
+        checksum = "7e28c2a923e4f4162b3d83877cedb5eb"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_enstdrums16k(data_home, dataset_name='enstdrums')
+    create_filelist_dtm_random_enstdrums16k(data_home, dataset_name='enstdrums')
+
+
+def install_egmd(data_home, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7831072/files/egmc_yourmt3_16k.tar.gz?download=1"
+        checksum = "4f615157ea4c52a64c6c9dcf68bf2bde"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_egmd16k(data_home, dataset_name='egmd')
+
+
+def install_mirst500(data_home, zenodo_token, no_down=False, sanity_check=True, apply_correction=False) -> None:
+    """ Update Oct 2023: MIR-ST500 with FULL audio files"""
+    if not no_down:
+        url = "https://zenodo.org/records/10016397/files/mir_st500_yourmt3_16k.tar.gz?download=1"
+        checksum = "98eb52eb2456ce4034e21750f309da13"
+        download_and_extract(data_home, url, check_sum=checksum, zenodo_token=zenodo_token)
+    preprocess_mir_st500_16k(data_home, dataset_name='mir_st500', sanity_check=sanity_check)
+
+
+def install_cmedia(data_home, zenodo_token, no_down=False, sanity_check=True) -> None:
+    if not no_down:
+        url = "https://zenodo.org/records/10016397/files/cmedia_yourmt3_16k.tar.gz?download=1"
+        checksum = "e6cca23577ba7588e9ed9711a398f7cf"
+        download_and_extract(data_home, url, check_sum=checksum, zenodo_token=zenodo_token)
+    preprocess_cmedia_16k(data_home, dataset_name='cmedia', sanity_check=sanity_check, apply_correction=True)
+
+
+def install_rwc_pop(data_home, zenodo_token, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/records/10016397/files/rwc_pop_yourmt3_16k.tar.gz?download=1"
+        checksum = "ad459f9fa1b6b87676b2fb37c0ba5dfc"
+        download_and_extract(data_home, url, check_sum=checksum, zenodo_token=zenodo_token)
+    preprocess_rwc_pop16k(data_home, dataset_name='rwc_pop')  # bass transcriptions
+    preprocess_rwc_pop_full16k(data_home, dataset_name='rwc_pop')  # full transcriptions
+
+
+def install_mir1k(data_home, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/7955481/files/mir1k_yourmt3_16k.tar.gz?download=1"
+        checksum = "4cbac56a4e971432ca807efd5cb76d67"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    # preprocess_mir1k_16k(data_home, dataset_name='mir1k')
+
+
+def install_urmp(data_home, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/record/8021437/files/urmp_yourmt3_16k.tar.gz?download=1"
+        checksum = "4f539c71678a77ba34f6dfca41072102"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_urmp16k(data_home, dataset_name='urmp')
+
+
+def install_idmt_smt_bass(data_home, no_down=False) -> None:
+    if not no_down:
+        url = "https://zenodo.org/records/10009959/files/idmt_smt_bass_yourmt3_16k.tar.gz?download=1"
+        checksum = "0c95f91926a1e95b1f5d075c05b7eb76"
+        download_and_extract(data_home, url, remove_tar_file=True, check_sum=checksum)
+    preprocess_idmt_smt_bass_16k(data_home, dataset_name='idmt_smt_bass', sanity_check=True,
+                                 edit_audio=False)  # the downloaded audio has already been edited
+
+
+def install_random_nsynth(data_home, no_down=False) -> None:
+    return
+
+
+def install_geerdes(data_home) -> None:
+    try:
+        preprocess_geerdes16k(data_home, dataset_name='geerdes', sanity_check=False)
+    except Exception as e:
+        print(e)
+        print("Geerdes dataset is not available for download. Please contact the dataset provider.")
+
+
+def regenerate_dataset_stats(data_home) -> None:
+    generate_dataset_stats_for_all_datasets(data_home)
+
+
+def get_cached_zenodo_token() -> str:
+    # check if cached token exists
+    if not os.path.exists('.cached_zenodo_token'):
+        raise Exception("Cached Zenodo token not found. Please enter your Zenodo token.")
+    # read cached token
+    with open('.cached_zenodo_token', 'r') as f:
+        zenodo_token = f.read().strip()
+        print(f"Using cached Zenodo token: {zenodo_token}")
+    return zenodo_token
+
+
+def cache_zenodo_token(zenodo_token: str) -> None:
+    with open('.cached_zenodo_token', 'w') as f:
+        f.write(zenodo_token)
+    print("Your Zenodo token is cached.")
+
+
+def option_prompt(data_home: os.PathLike, no_download: bool = False) -> None:
+    print("Select the dataset(s) to install (enter comma-separated numbers):")
+    print("1. Slakh")
+    print("2. MusicNet")
+    print("3. MAPS")
+    print("4. Maestro")
+    print("5. GuitarSet")
+    print("6. ENST-drums")
+    print("7. EGMD")
+    print("8. MIR-ST500 ** Restricted Access **")
+    print("9. CMedia ** Restricted Access **")
+    print("10. RWC-Pop (Bass and Full) ** Restricted Access **")
+    print("11. MIR-1K (NOT SUPPORTED)")
+    print("12. URMP")
+    print("13. IDMT-SMT-Bass")
+    print("14. Random-NSynth")
+    print("15. Geerdes")
+    print("16. Regenerate Dataset Stats (experimental)")
+    print("17. Request Token for ** Restricted Access **")
+    print("18. Exit")
+
+    choice = input("Enter your choices (multiple choices with comma): ")
+    choices = [c.strip() for c in choice.split(',')]
+
+    if "18" in choices:
+        print("Exiting.")
+    else:
+        # ask for Zenodo token
+        for c in choices:
+            if int(c) in [8, 9, 10]:
+                if no_download is True:
+                    zenodo_token = None
+                else:
+                    zenodo_token = input("Enter Zenodo token, or press enter to use the cached token:")
+                    if zenodo_token == "":
+                        zenodo_token = get_cached_zenodo_token()
+                    else:
+                        cache_zenodo_token(zenodo_token)
+                break
+
+        if "1" in choices:
+            install_slakh(data_home, no_down=no_download)
+        if "2" in choices:
+            install_musicnet(data_home, no_down=no_download)
+        if "3" in choices:
+            install_maps(data_home, no_down=no_download)
+        if "4" in choices:
+            install_maestro(data_home, no_down=no_download)
+        if "5" in choices:
+            install_guitarset(data_home, no_down=no_download)
+        if "6" in choices:
+            install_enstdrums(data_home, no_down=no_download)
+        if "7" in choices:
+            install_egmd(data_home, no_down=no_download)
+        if "8" in choices:
+            install_mirst500(data_home, zenodo_token, no_down=no_download)
+        if "9" in choices:
+            install_cmedia(data_home, zenodo_token, no_down=no_download)
+        if "10" in choices:
+            install_rwc_pop(data_home, zenodo_token, no_down=no_download)
+        if "11" in choices:
+            install_mir1k(data_home, no_down=no_download)
+        if "12" in choices:
+            install_urmp(data_home, no_down=no_download)
+        if "13" in choices:
+            install_idmt_smt_bass(data_home, no_down=no_download)
+        if "14" in choices:
+            install_random_nsynth(data_home, no_down=no_download)
+        if "15" in choices:
+            install_geerdes(data_home)  # not available for download
+        if "16" in choices:
+            regenerate_dataset_stats(data_home)
+        if "17" in choices:
+            print("\nPlease visit https://zenodo.org/records/10016397 to request a Zenodo token.")
+            print("Upon submitting your request, you will receive an email with a link labeled 'Access the record'.")
+            print("Copy the token that follows 'token=' in that link.")
+        if not any(int(c) in range(16) for c in choices):
+            print("Invalid choice(s). Please enter valid numbers separated by commas.")
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Dataset installer script.')
+    # data home dir
+    parser.add_argument(
+        'data_home',
+        type=str,
+        nargs='?',
+        default=None,
+        help='Path to data home directory. If None, use the default path defined in src/config/config.py')
+    # `no_download` option
+    parser.add_argument('--nodown',
+                        '-nd',
+                        action='store_true',
+                        help='Flag to control downloading. If set, no downloading will occur.')
+    args = parser.parse_args()
+
+    if args.data_home is None:
+        from config.config import shared_cfg
+        data_home = shared_cfg["PATH"]["data_home"]
+    else:
+        data_home = args.data_home
+    os.makedirs(data_home, exist_ok=True)
+    no_download = args.nodown
+
+    option_prompt(data_home, no_download)

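The script is interactive, but the installers can also be driven non-interactively by importing the module functions. A minimal sketch (editor's illustration, run from `amt/src`; the `data_home` path is a placeholder):

```python
from install_dataset import install_guitarset, install_urmp, regenerate_dataset_stats

data_home = "/path/to/data"                    # placeholder
install_guitarset(data_home, no_down=False)    # download + preprocess GuitarSet
install_urmp(data_home, no_down=True)          # preprocess only, skip download
regenerate_dataset_stats(data_home)            # optional, experimental
```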
amt/src/model/RoPE/RoPE.py
ADDED
@@ -0,0 +1,306 @@
+"""rotary_embedding.py - Rotary Embedding based on https://github.com/lucidrains/rotary-embedding-torch"""
+from typing import Literal, Union, Optional
+from math import pi, log
+from einops import rearrange, repeat
+
+import torch
+from torch.nn import Module, ModuleList
+from torch.cuda.amp import autocast
+from torch import nn, einsum, broadcast_tensors, Tensor
+
+
+# helper functions
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    return val if exists(val) else d
+
+
+# broadcat, as tortoise-tts was using it
+def broadcat(tensors, dim=-1):
+    broadcasted_tensors = broadcast_tensors(*tensors)
+    return torch.cat(broadcasted_tensors, dim=dim)
+
+
+# rotary embedding helper functions
+def rotate_half(x):
+    x = rearrange(x, '... (d r) -> ... d r', r=2)
+    x1, x2 = x.unbind(dim=-1)
+    x = torch.stack((-x2, x1), dim=-1)
+    return rearrange(x, '... d r -> ... (d r)')
+
+
+@autocast(enabled=False)
+def apply_rotary_emb(freqs, t, start_index=0, scale=1., seq_dim=-2):
+    """Applies rotary embedding for pixels."""
+    if t.ndim == 3:
+        seq_len = t.shape[seq_dim]
+        freqs = freqs[-seq_len:].to(t)
+
+    rot_dim = freqs.shape[-1]
+    end_index = start_index + rot_dim
+
+    assert rot_dim <= t.shape[
+        -1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
+
+    t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
+    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
+    return torch.cat((t_left, t, t_right), dim=-1)
+
+
+# learned rotation helpers
+def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
+    if exists(freq_ranges):
+        rotations = einsum('..., f -> ... f', rotations, freq_ranges)
+        rotations = rearrange(rotations, '... r f -> ... (r f)')
+
+    rotations = repeat(rotations, '... n -> ... (n r)', r=2)
+    return apply_rotary_emb(rotations, t, start_index=start_index)
+
+
+# classes
+class RotaryEmbedding(Module):
+
+    def __init__(self,
+                 dim,
+                 custom_freqs: Optional[Tensor] = None,
+                 freqs_for: Union[Literal['lang'], Literal['pixel'], Literal['constant']] = 'lang',
+                 theta=10000,
+                 max_freq=10,
+                 num_freqs=1,
+                 learned_freq=False,
+                 use_xpos=False,
+                 xpos_scale_base=512,
+                 interpolate_factor=1.,
+                 theta_rescale_factor=1.,
+                 seq_before_head_dim=False,
+                 cache_if_possible=True):
+        super().__init__()
+        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+        # has some connection to NTK literature
+        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+
+        theta *= theta_rescale_factor**(dim / (dim - 2))
+
+        self.freqs_for = freqs_for
+
+        if exists(custom_freqs):
+            freqs = custom_freqs
+        elif freqs_for == 'lang':
+            freqs = 1. / (theta**(torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
+        elif freqs_for == 'pixel':
+            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
+        elif freqs_for == 'constant':
+            freqs = torch.ones(num_freqs).float()
+
+        self.cache_if_possible = cache_if_possible
+
+        self.tmp_store('cached_freqs', None)
+        self.tmp_store('cached_scales', None)
+
+        self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)
+
+        self.learned_freq = learned_freq
+
+        # dummy for device
+
+        self.tmp_store('dummy', torch.tensor(0))
+
+        # default sequence dimension
+
+        self.seq_before_head_dim = seq_before_head_dim
+        self.default_seq_dim = -3 if seq_before_head_dim else -2
+
+        # interpolation factors
+
+        assert interpolate_factor >= 1.
+        self.interpolate_factor = interpolate_factor
+
+        # xpos
+
+        self.use_xpos = use_xpos
+        if not use_xpos:
+            self.tmp_store('scale', None)
+            return
+
+        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+        self.scale_base = xpos_scale_base
+        self.tmp_store('scale', scale)
+
+    @property
+    def device(self):
+        return self.dummy.device
+
+    def tmp_store(self, key, value):
+        self.register_buffer(key, value, persistent=False)
+
+    def get_seq_pos(self, seq_len, device, dtype, offset=0):
+        return (torch.arange(seq_len, device=device, dtype=dtype) + offset) / self.interpolate_factor
+
+    def rotate_queries_or_keys(self, t, seq_dim=None, offset=0, freq_seq_len=None):
+        seq_dim = default(seq_dim, self.default_seq_dim)
+
+        assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
+
+        device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
+
+        if exists(freq_seq_len):
+            assert freq_seq_len >= seq_len
+            seq_len = freq_seq_len
+
+        freqs = self.forward(self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset),
+                             seq_len=seq_len,
+                             offset=offset)
+
+        if seq_dim == -3:
+            freqs = rearrange(freqs, 'n d -> n 1 d')
+
+        return apply_rotary_emb(freqs, t, seq_dim=seq_dim)
+
+    def rotate_queries_with_cached_keys(self, q, k, seq_dim=None, offset=0):
+        seq_dim = default(seq_dim, self.default_seq_dim)
+
+        q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
+        assert q_len <= k_len
+        rotated_q = self.rotate_queries_or_keys(q, seq_dim=seq_dim, freq_seq_len=k_len)
+        rotated_k = self.rotate_queries_or_keys(k, seq_dim=seq_dim)
+
+        rotated_q = rotated_q.type(q.dtype)
+        rotated_k = rotated_k.type(k.dtype)
+
+        return rotated_q, rotated_k
+
+    def rotate_queries_and_keys(self, q, k, seq_dim=None):
+        seq_dim = default(seq_dim, self.default_seq_dim)
+
+        assert self.use_xpos
+        device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
+
+        seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)
+
+        freqs = self.forward(seq, seq_len=seq_len)
+        scale = self.get_scale(seq, seq_len=seq_len).to(dtype)
+
+        if seq_dim == -3:
+            freqs = rearrange(freqs, 'n d -> n 1 d')
+            scale = rearrange(scale, 'n d -> n 1 d')
+
+        rotated_q = apply_rotary_emb(freqs, q, scale=scale, seq_dim=seq_dim)
+        rotated_k = apply_rotary_emb(freqs, k, scale=scale**-1, seq_dim=seq_dim)
+
+        rotated_q = rotated_q.type(q.dtype)
+        rotated_k = rotated_k.type(k.dtype)
+
+        return rotated_q, rotated_k
+
+    def get_scale(self, t: Tensor, seq_len: Optional[int] = None, offset=0):
+        assert self.use_xpos
+
+        should_cache = (self.cache_if_possible and exists(seq_len))
+
+        if (
+            should_cache and \
+            exists(self.cached_scales) and \
+            (seq_len + offset) <= self.cached_scales.shape[0]
+        ):
+            return self.cached_scales[offset:(offset + seq_len)]
+
+        scale = 1.
+        if self.use_xpos:
+            power = (t - len(t) // 2) / self.scale_base
+            scale = self.scale**rearrange(power, 'n -> n 1')
+            scale = torch.cat((scale, scale), dim=-1)
+
+        if should_cache:
+            self.tmp_store('cached_scales', scale)
+
+        return scale
+
+    def get_axial_freqs(self, *dims):
+        Colon = slice(None)
+        all_freqs = []
+
+        for ind, dim in enumerate(dims):
+            if self.freqs_for == 'pixel':
+                pos = torch.linspace(-1, 1, steps=dim, device=self.device)
+            else:
+                pos = torch.arange(dim, device=self.device)
+
+            freqs = self.forward(pos, seq_len=dim)
+
+            all_axis = [None] * len(dims)
+            all_axis[ind] = Colon
+
+            new_axis_slice = (Ellipsis, *all_axis, Colon)
+            all_freqs.append(freqs[new_axis_slice])
+
+        all_freqs = broadcast_tensors(*all_freqs)
+        return torch.cat(all_freqs, dim=-1)
+
+    @autocast(enabled=False)
+    def forward(self, t: Tensor, seq_len=None, offset=0):
+        should_cache = (
+            self.cache_if_possible and \
+            not self.learned_freq and \
+            exists(seq_len) and \
+            self.freqs_for != 'pixel'
+        )
+
+        if (
+            should_cache and \
+            exists(self.cached_freqs) and \
+            (offset + seq_len) <= self.cached_freqs.shape[0]
+        ):
+            return self.cached_freqs[offset:(offset + seq_len)].detach()
+
+        freqs = self.freqs
+
+        freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
+        freqs = repeat(freqs, '... n -> ... (n r)', r=2)
+
+        if should_cache:
+            self.tmp_store('cached_freqs', freqs.detach())
+
+        return freqs
+
+    # custom method for applying rotary embeddings
+    @torch.compiler.disable
+    def apply_rotary_custom(self, t: torch.Tensor):
+        """Apply rotary embeddings to queries and keys, if k is None, only q is rotated.
+        Depending on the freqs type, the rotation will be different."""
+        if self.freqs_for == 'lang':
+            return self.rotate_queries_or_keys(t, seq_dim=-2)
+        elif self.freqs_for == 'pixel':
+            return apply_rotary_emb(self.get_axial_freqs(t.shape[-2]), t)
+        else:
+            raise ValueError(f"freqs_for must be 'lang' or 'pixel', but got {self.freqs_for}")
+
+
+def test_rotary_embedding_lang():
+    d = 32  # d by head
+    q = torch.ones(1, 4, 110, 32)  # (B, H, T, D) for multi-head attention
+    rdim = d // 2  # will do a partial rotation on half, or d
+
+    rotary = RotaryEmbedding(dim=rdim, freqs_for="lang")
+    q = rotary.rotate_queries_or_keys(q, seq_dim=-2)
+
+    # visualize
+    import matplotlib.pyplot as plt
+    plt.imshow(q[0, 0, :, :].numpy().T, origin='lower')
+
+
+def test_rotary_embedding_pixel():
+    d = 32  # d by head
+    q = torch.ones(1, 4, 128, 32)  # (B*T, H, F, C/H) for multi-head attention
+    rdim = d // 2  # will do a partial rotation on half
+
+    rotary = RotaryEmbedding(dim=rdim, freqs_for="pixel", max_freq=10)
+    freqs = rotary.get_axial_freqs(128)
+
+    q = apply_rotary_emb(freqs, q)  # also k, if needed
+
+    # visualize
+    import matplotlib.pyplot as plt
+    plt.imshow(q[0, 0, :, :].numpy().T, origin='lower')

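For context, a minimal sketch of how a `RotaryEmbedding` in `'lang'` mode is usually wired into scaled dot-product attention (an editor's illustration, not code from the diff; shapes and the partial-rotation choice follow `test_rotary_embedding_lang`, and the import path is assumed from the file location):

```python
import torch
from model.RoPE.RoPE import RotaryEmbedding  # import path assumed

B, H, T, D = 1, 4, 110, 32
q, k = torch.randn(B, H, T, D), torch.randn(B, H, T, D)

rotary = RotaryEmbedding(dim=D // 2, freqs_for='lang')  # partial rotation on half of D
q = rotary.rotate_queries_or_keys(q)
k = rotary.rotate_queries_or_keys(k)

# positions now enter attention only through the rotated q/k dot products
scores = q @ k.transpose(-2, -1) / (D ** 0.5)
attn = torch.softmax(scores, dim=-1)
```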
amt/src/model/conformer_helper.py
ADDED
@@ -0,0 +1,169 @@
+# Copyright 2024 The YourMT3 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Please see the details in the LICENSE file.
+import math
+from typing import Optional, Union
+
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+
+
+class ConformerYMT3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ConformerYMT3Encoder`]. It is used to
+    instantiate a ConformerYMT3Encoder according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Conformer
+    [facebook/wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 512):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (`int`, *optional*, defaults to 8):
+            Number of hidden layers in the Transformer encoder.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        dropout_rate (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+            if `add_adapter is True`.
+        position_encoding_type (`str`, *optional*, defaults to `"rotary"`):
+            Can be specified to `relative` or `rotary` for relative or rotary position embeddings respectively. If left
+            `None` no relative position embedding is applied.
+        rotary_embedding_base (`int`, *optional*, defaults to 10000):
+            If `"rotary"` position embeddings are used, defines the size of the embedding base.
+        num_max_positions (`int`, *optional*, defaults to 1024):
+            If `"relative"` position embeddings are used, defines the maximum source input positions.
+        conv_depthwise_kernel_size (`int`, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks.
+
+    Example:
+
+    ```python
+    >>> from transformers import ConformerYMT3Config, ConformerYMT3Encoder
+
+    >>> # Initializing a ConformerYMT3Encoder configuration
+    >>> configuration = ConformerYMT3Config()
+
+    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-conformer-rel-pos-large style configuration
+    >>> model = ConformerYMT3Encoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "conformer-ymt3"
+
+    def __init__(
+        self,
+        d_model=512,  # 768
+        num_layers=8,  # ConformerYMT3Encoder
+        num_heads=8,  # ConformerYMT3SelfAttention
+        intermediate_size=2048,  # 3072, # used in intermediate_dense of ConformerYMT3FeedForward
+        hidden_act="gelu",  # used in intermediate_act_fn of ConformerYMT3FeedForward
+        dropout_rate=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 3, 3),
+        conv_bias=False,
+        position_encoding_type="rotary",
+        rotary_embedding_base=10000,
+        num_max_positions=1024,
+        conv_depthwise_kernel_size=31,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.d_model = d_model
+        self.conv_dim = list(conv_dim)
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_layers = num_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_heads = num_heads
+        self.dropout_rate = dropout_rate
+
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.num_max_positions = num_max_positions
+        self.position_encoding_type = position_encoding_type
+        self.rotary_embedding_base = rotary_embedding_base
+
+        # Conformer-block related
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+
+
+class ConformerYMT3PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = ConformerYMT3Config
+    base_model_prefix = "wav2vec2_conformer"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if module.__class__.__name__ == "ConformerYMT3SelfAttention":
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if module.__class__.__name__ == "ConformerYMT3Encoder":
+            module.gradient_checkpointing = value

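A small sketch (editor's illustration, import path assumed from the file location) of overriding the defaults above; the main structural constraint used by the attention module in the next file is that `d_model` must be divisible by `num_heads`:

```python
from model.conformer_helper import ConformerYMT3Config  # import path assumed

config = ConformerYMT3Config(d_model=768, num_heads=12, num_layers=8,
                             position_encoding_type='rotary')
assert config.d_model % config.num_heads == 0  # head_size = d_model // num_heads
```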
amt/src/model/conformer_mod.py
ADDED
@@ -0,0 +1,439 @@
+# Copyright 2024 The YourMT3 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Please see the details in the LICENSE file.
+from typing import Tuple, Literal, Any, Optional
+import math
+
+import torch
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput
+
+from model.conformer_helper import ConformerYMT3Config, ConformerYMT3PreTrainedModel
+from model.positional_encoding import (Wav2Vec2ConformerRelPositionalEmbedding,
+                                       Wav2Vec2ConformerRotaryPositionalEmbedding)
+
+
+class ConformerYMT3FeedForward(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_dropout = nn.Dropout(config.dropout_rate)
+
+        self.intermediate_dense = nn.Linear(config.d_model, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+        self.output_dense = nn.Linear(config.intermediate_size, config.d_model)
+        self.output_dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class ConformerYMT3ConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block"""
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.d_model)
+        self.pointwise_conv1 = torch.nn.Conv1d(
+            config.d_model,
+            2 * config.d_model,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = torch.nn.GLU(dim=1)
+        self.depthwise_conv = torch.nn.Conv1d(
+            config.d_model,
+            config.d_model,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=(config.conv_depthwise_kernel_size - 1) // 2,
+            groups=config.d_model,
+            bias=False,
+        )
+        self.batch_norm = torch.nn.BatchNorm1d(config.d_model)
+        self.activation = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = torch.nn.Conv1d(
+            config.d_model,
+            config.d_model,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = torch.nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = self.layer_norm(hidden_states)
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class ConformerYMT3SelfAttention(nn.Module):
+    """Construct a ConformerSelfAttention object.
+    Can be enhanced with rotary or relative position embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.head_size = config.d_model // config.num_heads
+        self.num_heads = config.num_heads
+        self.position_encoding_type = config.position_encoding_type
+
+        self.linear_q = nn.Linear(config.d_model, config.d_model)
+        self.linear_k = nn.Linear(config.d_model, config.d_model)
+        self.linear_v = nn.Linear(config.d_model, config.d_model)
+        self.linear_out = nn.Linear(config.d_model, config.d_model)
+
+        self.dropout = nn.Dropout(p=config.dropout_rate)
+
+        if self.position_encoding_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(config.d_model, config.d_model, bias=False)
+            # these two learnable bias are used in matrix c and matrix d
+            # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, d_model = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_encoding_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_encoding_type == 'rotary'")
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_encoding_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError("`relative_position_embeddings` has to be defined when `self.position_encoding_type =="
+                                 " 'relative'")
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://arxiv.org/abs/1901.02860
+            scores = self._apply_relative_embeddings(query=query,
+                                                     key=key,
+                                                     relative_position_embeddings=relative_position_embeddings)
+        else:
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, d_model)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+        batch_size, sequence_length, d_model = hidden_states.size()
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+        cos = relative_position_embeddings[0, :sequence_length, ...]
+        sin = relative_position_embeddings[1, :sequence_length, ...]
+
+        # rotate hidden_states with rotary embeddings
+        hidden_states = hidden_states.transpose(0, 1)
+        rotated_states_begin = hidden_states[..., :self.head_size // 2]
+        rotated_states_end = hidden_states[..., self.head_size // 2:]
+        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+        hidden_states = (hidden_states * cos) + (rotated_states * sin)
+        hidden_states = hidden_states.transpose(0, 1)
+
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+        return hidden_states
+
+    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+        # 1. project positional embeddings
+        # => (batch, head, 2*time1-1, d_k)
+        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.view(relative_position_embeddings.size(0),
+                                                                                   -1, self.num_heads, self.head_size)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+        # 2. Add bias to query
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+        # 3. attention score: first compute matrix a and matrix c
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        # => (batch, head, time1, time2)
+        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+        # 4. then compute matrix b and matrix d
+        # => (batch, head, time1, 2*time1-1)
+        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+        # 5. shift matrix b and matrix d
+        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
|
243 |
+
scores_bd = scores_bd[:, :, :, :scores_bd.size(-1) // 2 + 1]
|
244 |
+
|
245 |
+
# 6. sum matrices
|
246 |
+
# => (batch, head, time1, time2)
|
247 |
+
scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
|
248 |
+
|
249 |
+
return scores
|
250 |
+
|
251 |
+
|
252 |
+
class ConformerYMT3EncoderLayer(nn.Module):
|
253 |
+
"""Conformer block based on https://arxiv.org/abs/2005.08100."""
|
254 |
+
|
255 |
+
def __init__(self, config):
|
256 |
+
super().__init__()
|
257 |
+
embed_dim = config.d_model
|
258 |
+
dropout = config.dropout_rate
|
259 |
+
|
260 |
+
# Feed-forward 1
|
261 |
+
self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
|
262 |
+
self.ffn1 = ConformerYMT3FeedForward(config)
|
263 |
+
|
264 |
+
# Self-Attention
|
265 |
+
self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
|
266 |
+
self.self_attn_dropout = torch.nn.Dropout(dropout)
|
267 |
+
self.self_attn = ConformerYMT3SelfAttention(config)
|
268 |
+
|
269 |
+
# Conformer Convolution
|
270 |
+
self.conv_module = ConformerYMT3ConvolutionModule(config)
|
271 |
+
|
272 |
+
# Feed-forward 2
|
273 |
+
self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
|
274 |
+
self.ffn2 = ConformerYMT3FeedForward(config)
|
275 |
+
self.final_layer_norm = nn.LayerNorm(embed_dim)
|
276 |
+
|
277 |
+
def forward(
|
278 |
+
self,
|
279 |
+
hidden_states,
|
280 |
+
attention_mask: Optional[torch.Tensor] = None,
|
281 |
+
relative_position_embeddings: Optional[torch.Tensor] = None,
|
282 |
+
output_attentions: bool = False,
|
283 |
+
):
|
284 |
+
hidden_states = hidden_states
|
285 |
+
|
286 |
+
# 1. Feed-Forward 1 layer
|
287 |
+
residual = hidden_states
|
288 |
+
hidden_states = self.ffn1_layer_norm(hidden_states)
|
289 |
+
hidden_states = self.ffn1(hidden_states)
|
290 |
+
hidden_states = hidden_states * 0.5 + residual
|
291 |
+
residual = hidden_states
|
292 |
+
|
293 |
+
# 2. Self-Attention layer
|
294 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
295 |
+
hidden_states, attn_weigts = self.self_attn(
|
296 |
+
hidden_states=hidden_states,
|
297 |
+
attention_mask=attention_mask,
|
298 |
+
relative_position_embeddings=relative_position_embeddings,
|
299 |
+
output_attentions=output_attentions,
|
300 |
+
)
|
301 |
+
hidden_states = self.self_attn_dropout(hidden_states)
|
302 |
+
hidden_states = hidden_states + residual
|
303 |
+
|
304 |
+
# 3. Convolutional Layer
|
305 |
+
residual = hidden_states
|
306 |
+
hidden_states = self.conv_module(hidden_states)
|
307 |
+
hidden_states = residual + hidden_states
|
308 |
+
|
309 |
+
# 4. Feed-Forward 2 Layer
|
310 |
+
residual = hidden_states
|
311 |
+
hidden_states = self.ffn2_layer_norm(hidden_states)
|
312 |
+
hidden_states = self.ffn2(hidden_states)
|
313 |
+
hidden_states = hidden_states * 0.5 + residual
|
314 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
315 |
+
|
316 |
+
return hidden_states, attn_weigts
|
317 |
+
|
318 |
+
|
319 |
+
class ConformerYMT3Encoder(nn.Module):
|
320 |
+
|
321 |
+
def __init__(self, config):
|
322 |
+
super().__init__()
|
323 |
+
self.config = config
|
324 |
+
|
325 |
+
if config.position_encoding_type == "relative":
|
326 |
+
self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
|
327 |
+
elif config.position_encoding_type == "rotary":
|
328 |
+
self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
|
329 |
+
else:
|
330 |
+
self.embed_positions = None
|
331 |
+
|
332 |
+
# self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
|
333 |
+
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
|
334 |
+
self.dropout = nn.Dropout(config.dropout_rate)
|
335 |
+
self.layers = nn.ModuleList([ConformerYMT3EncoderLayer(config) for _ in range(config.num_layers)])
|
336 |
+
self.gradient_checkpointing = False
|
337 |
+
|
338 |
+
def forward(
|
339 |
+
self,
|
340 |
+
inputs_embeds: torch.FloatTensor, # (B, T, D)
|
341 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
342 |
+
output_attentions: Optional[bool] = False,
|
343 |
+
output_hidden_states: Optional[bool] = False,
|
344 |
+
return_dict: Optional[bool] = True,
|
345 |
+
):
|
346 |
+
if output_attentions is None:
|
347 |
+
output_attentions = self.config.output_attentions
|
348 |
+
if output_hidden_states is None:
|
349 |
+
output_hidden_states = self.config.output_hidden_states
|
350 |
+
if return_dict is None:
|
351 |
+
return_dict = self.config.use_return_dict
|
352 |
+
all_hidden_states = () if output_hidden_states else None
|
353 |
+
all_self_attentions = () if output_attentions else None
|
354 |
+
|
355 |
+
# inputs_embeds as hidden_states
|
356 |
+
hidden_states = inputs_embeds
|
357 |
+
|
358 |
+
if attention_mask is not None:
|
359 |
+
# make sure padded tokens output 0
|
360 |
+
hidden_states[~attention_mask] = 0.0
|
361 |
+
|
362 |
+
# extend attention_mask
|
363 |
+
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
|
364 |
+
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
|
365 |
+
attention_mask = attention_mask.expand(attention_mask.shape[0], 1, attention_mask.shape[-1],
|
366 |
+
attention_mask.shape[-1])
|
367 |
+
|
368 |
+
hidden_states = self.dropout(hidden_states)
|
369 |
+
|
370 |
+
if self.embed_positions is not None:
|
371 |
+
relative_position_embeddings = self.embed_positions(hidden_states)
|
372 |
+
else:
|
373 |
+
relative_position_embeddings = None
|
374 |
+
|
375 |
+
for i, layer in enumerate(self.layers):
|
376 |
+
if output_hidden_states:
|
377 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
378 |
+
|
379 |
+
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
380 |
+
dropout_probability = torch.rand([])
|
381 |
+
|
382 |
+
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
|
383 |
+
if not skip_the_layer:
|
384 |
+
# under deepspeed zero3 all gpus must run in sync
|
385 |
+
if self.gradient_checkpointing and self.training:
|
386 |
+
# create gradient checkpointing function
|
387 |
+
def create_custom_forward(module):
|
388 |
+
|
389 |
+
def custom_forward(*inputs):
|
390 |
+
return module(*inputs, output_attentions)
|
391 |
+
|
392 |
+
return custom_forward
|
393 |
+
|
394 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
395 |
+
create_custom_forward(layer),
|
396 |
+
hidden_states,
|
397 |
+
attention_mask,
|
398 |
+
relative_position_embeddings,
|
399 |
+
)
|
400 |
+
else:
|
401 |
+
layer_outputs = layer(
|
402 |
+
hidden_states,
|
403 |
+
attention_mask=attention_mask,
|
404 |
+
relative_position_embeddings=relative_position_embeddings,
|
405 |
+
output_attentions=output_attentions,
|
406 |
+
)
|
407 |
+
hidden_states = layer_outputs[0]
|
408 |
+
|
409 |
+
if skip_the_layer:
|
410 |
+
layer_outputs = (None, None)
|
411 |
+
|
412 |
+
if output_attentions:
|
413 |
+
all_self_attentions = all_self_attentions + (layer_outputs[1],)
|
414 |
+
|
415 |
+
hidden_states = self.layer_norm(hidden_states)
|
416 |
+
if output_hidden_states:
|
417 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
418 |
+
|
419 |
+
if not return_dict:
|
420 |
+
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
|
421 |
+
return BaseModelOutput(
|
422 |
+
last_hidden_state=hidden_states,
|
423 |
+
hidden_states=all_hidden_states,
|
424 |
+
attentions=all_self_attentions,
|
425 |
+
)
|
426 |
+
|
427 |
+
|
428 |
+
def test():
|
429 |
+
import torch
|
430 |
+
from model.conformer_mod import ConformerYMT3Encoder
|
431 |
+
from model.conformer_helper import ConformerYMT3Config
|
432 |
+
from model.ops import count_parameters
|
433 |
+
config = ConformerYMT3Config()
|
434 |
+
encoder = ConformerYMT3Encoder(config)
|
435 |
+
encoder.eval()
|
436 |
+
# num params: 48,468,992 w/ intermediate_size=2048
|
437 |
+
# num params: 23,278,592 w/ intermediate_size=512
|
438 |
+
x = torch.randn(2, 256, 512) # (B, T, D)
|
439 |
+
enc_hs = encoder.forward(inputs_embeds=x)['last_hidden_state'] # (B, T, D)
|
amt/src/model/conv_block.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
from typing import Literal
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.nn.functional as F
|
14 |
+
from einops import rearrange
|
15 |
+
|
16 |
+
|
17 |
+
def init_layer(layer: nn.Module) -> None:
|
18 |
+
"""Initialize a Linear or Convolutional layer."""
|
19 |
+
nn.init.xavier_uniform_(layer.weight)
|
20 |
+
if hasattr(layer, "bias") and layer.bias is not None:
|
21 |
+
layer.bias.data.zero_()
|
22 |
+
|
23 |
+
|
24 |
+
def init_bn(bn: nn.Module) -> None:
|
25 |
+
"""Initialize a Batchnorm layer."""
|
26 |
+
bn.bias.data.zero_()
|
27 |
+
bn.weight.data.fill_(1.0)
|
28 |
+
bn.running_mean.data.zero_()
|
29 |
+
bn.running_var.data.fill_(1.0)
|
30 |
+
|
31 |
+
|
32 |
+
def act(x: torch.Tensor, activation: str) -> torch.Tensor:
|
33 |
+
"""Activation function."""
|
34 |
+
funcs = {"relu": F.relu_, "leaky_relu": lambda x: F.leaky_relu_(x, 0.01), "swish": lambda x: x * torch.sigmoid(x)}
|
35 |
+
return funcs.get(activation, lambda x: Exception("Incorrect activation!"))(x)
|
36 |
+
|
37 |
+
|
38 |
+
class Res2DAVPBlock(nn.Module):
|
39 |
+
|
40 |
+
def __init__(self, in_channels, out_channels, kernel_size, avp_kernel_size, activation):
|
41 |
+
"""Convolutional residual block modified fromr bytedance/music_source_separation."""
|
42 |
+
super().__init__()
|
43 |
+
|
44 |
+
padding = kernel_size[0] // 2, kernel_size[1] // 2
|
45 |
+
|
46 |
+
self.activation = activation
|
47 |
+
self.bn1, self.bn2 = nn.BatchNorm2d(out_channels), nn.BatchNorm2d(out_channels)
|
48 |
+
|
49 |
+
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, bias=False)
|
50 |
+
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, padding=padding, bias=False)
|
51 |
+
|
52 |
+
self.is_shortcut = in_channels != out_channels
|
53 |
+
if self.is_shortcut:
|
54 |
+
self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1))
|
55 |
+
|
56 |
+
self.avp = nn.AvgPool2d(avp_kernel_size)
|
57 |
+
self.init_weights()
|
58 |
+
|
59 |
+
def init_weights(self):
|
60 |
+
for m in [self.conv1, self.conv2] + ([self.shortcut] if self.is_shortcut else []):
|
61 |
+
init_layer(m)
|
62 |
+
for m in [self.bn1, self.bn2]:
|
63 |
+
init_bn(m)
|
64 |
+
|
65 |
+
def forward(self, x):
|
66 |
+
origin = x
|
67 |
+
x = act(self.bn1(self.conv1(x)), self.activation)
|
68 |
+
x = self.bn2(self.conv2(x))
|
69 |
+
x += self.shortcut(origin) if self.is_shortcut else origin
|
70 |
+
x = act(x, self.activation)
|
71 |
+
return self.avp(x)
|
72 |
+
|
73 |
+
|
74 |
+
class PreEncoderBlockRes3B(nn.Module):
|
75 |
+
|
76 |
+
def __init__(self, in_channels, out_channels, kernel_size=(3, 3), avp_kernerl_size=(1, 2), activation='relu'):
|
77 |
+
"""Pre-Encoder with 3 Res2DAVPBlocks."""
|
78 |
+
super().__init__()
|
79 |
+
|
80 |
+
self.blocks = nn.ModuleList([
|
81 |
+
Res2DAVPBlock(in_channels if i == 0 else out_channels, out_channels, kernel_size, avp_kernerl_size,
|
82 |
+
activation) for i in range(3)
|
83 |
+
])
|
84 |
+
|
85 |
+
def forward(self, x): # (B, T, F)
|
86 |
+
x = rearrange(x, 'b t f -> b 1 t f')
|
87 |
+
for block in self.blocks:
|
88 |
+
x = block(x)
|
89 |
+
return rearrange(x, 'b c t f -> b t f c')
|
90 |
+
|
91 |
+
|
92 |
+
def test_res3b():
|
93 |
+
# mel-spec input
|
94 |
+
x = torch.randn(2, 256, 512) # (B, T, F)
|
95 |
+
pre = PreEncoderBlockRes3B(in_channels=1, out_channels=128)
|
96 |
+
x = pre(x) # (2, 256, 64, 128): B T,F,C
|
97 |
+
|
98 |
+
x = torch.randn(2, 110, 1024) # (B, T, F)
|
99 |
+
pre = PreEncoderBlockRes3B(in_channels=1, out_channels=128)
|
100 |
+
x = pre(x) # (2, 110, 128, 128): B,T,F,C
|
101 |
+
|
102 |
+
|
103 |
+
# ====================================================================================================================
|
104 |
+
# PreEncoderBlockHFTT: hFT-Transformer-like Pre-encoder
|
105 |
+
# ====================================================================================================================
|
106 |
+
class PreEncoderBlockHFTT(nn.Module):
|
107 |
+
|
108 |
+
def __init__(self, margin_pre=15, margin_post=16) -> None:
|
109 |
+
"""Pre-Encoder with hFT-Transformer-like convolutions."""
|
110 |
+
super().__init__()
|
111 |
+
|
112 |
+
self.margin_pre, self.margin_post = margin_pre, margin_post
|
113 |
+
self.conv = nn.Conv2d(1, 4, kernel_size=(1, 5), padding='same', padding_mode='zeros')
|
114 |
+
self.emb_freq = nn.Linear(128, 128)
|
115 |
+
|
116 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
117 |
+
# x: (B, T, F)
|
118 |
+
x = rearrange(x, 'b t f -> b 1 f t') # (B, 1, F, T) or (2, 1, 128, 110)
|
119 |
+
x = F.pad(x, (self.margin_pre, self.margin_post), value=1e-7) # (B, 1, F, T+margin) or (2,1,128,141)
|
120 |
+
x = self.conv(x) # (B, C, F, T+margin) or (2, 4, 128, 141)
|
121 |
+
x = x.unfold(dimension=3, size=32, step=1) # (B, c1, T, F, c2) or (2, 4, 128, 110, 32)
|
122 |
+
x = rearrange(x, 'b c1 f t c2 -> b t f (c1 c2)') # (B, T, F, C) or (2, 110, 128, 128)
|
123 |
+
return self.emb_freq(x) # (B, T, F, C) or (2, 110, 128, 128)
|
124 |
+
|
125 |
+
|
126 |
+
def test_hftt():
|
127 |
+
# from model.spectrogram import get_spectrogram_layer_from_audio_cfg
|
128 |
+
# from config.config import audio_cfg as default_audio_cfg
|
129 |
+
# audio_cfg = default_audio_cfg
|
130 |
+
# audio_cfg['codec'] = 'melspec'
|
131 |
+
# audio_cfg['hop_length'] = 300
|
132 |
+
# audio_cfg['n_mels'] = 128
|
133 |
+
# x = torch.randn(2, 1, 32767)
|
134 |
+
# mspec, _ = get_spectrogram_layer_from_audio_cfg(audio_cfg)
|
135 |
+
# x = mspec(x)
|
136 |
+
x = torch.randn(2, 110, 128) # (B, T, F)
|
137 |
+
pre_enc_hftt = PreEncoderBlockHFTT()
|
138 |
+
y = pre_enc_hftt(x) # (2, 110, 128, 128): B, T, F, C
|
139 |
+
|
140 |
+
|
141 |
+
# ====================================================================================================================
|
142 |
+
# PreEncoderBlockRes3BHFTT: hFT-Transformer-like Pre-encoder with Res2DAVPBlock and spec input
|
143 |
+
# ====================================================================================================================
|
144 |
+
class PreEncoderBlockRes3BHFTT(nn.Module):
|
145 |
+
|
146 |
+
def __init__(self, margin_pre: int = 15, margin_post: int = 16) -> None:
|
147 |
+
"""Pre-Encoder with hFT-Transformer-like convolutions.
|
148 |
+
|
149 |
+
Args:
|
150 |
+
margin_pre (int): padding before the input
|
151 |
+
margin_post (int): padding after the input
|
152 |
+
stack_dim (Literal['c', 'f']): stack dimension. channel or frequency
|
153 |
+
|
154 |
+
"""
|
155 |
+
super().__init__()
|
156 |
+
self.margin_pre, self.margin_post = margin_pre, margin_post
|
157 |
+
self.res3b = PreEncoderBlockRes3B(in_channels=1, out_channels=4)
|
158 |
+
self.emb_freq = nn.Linear(128, 128)
|
159 |
+
|
160 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
161 |
+
# x: (B, T, F) or (2, 110, 1024), input spectrogram
|
162 |
+
x = rearrange(x, 'b t f -> b f t') # (2, 1024, 110): B,F,T
|
163 |
+
x = F.pad(x, (self.margin_pre, self.margin_post), value=1e-7) # (2, 1024, 141): B,F,T+margin
|
164 |
+
x = rearrange(x, 'b f t -> b t f') # (2, 141, 1024): B,T+margin,F
|
165 |
+
x = self.res3b(x) # (2, 141, 128, 4): B,T+margin,F,C
|
166 |
+
x = x.unfold(dimension=1, size=32, step=1) # (B, T, F, C1, C2) or (2, 110, 128, 4, 32)
|
167 |
+
x = rearrange(x, 'b t f c1 c2 -> b t f (c1 c2)') # (B, T, F, C) or (2, 110, 128, 128)
|
168 |
+
return self.emb_freq(x) # (B, T, F, C) or (2, 110, 128, 128)
|
169 |
+
|
170 |
+
|
171 |
+
def test_res3b_hftt():
|
172 |
+
# from model.spectrogram import get_spectrogram_layer_from_audio_cfg
|
173 |
+
# from config.config import audio_cfg as default_audio_cfg
|
174 |
+
# audio_cfg = default_audio_cfg
|
175 |
+
# audio_cfg['codec'] = 'spec'
|
176 |
+
# audio_cfg['hop_length'] = 300
|
177 |
+
# x = torch.randn(2, 1, 32767)
|
178 |
+
# spec, _ = get_spectrogram_layer_from_audio_cfg(audio_cfg)
|
179 |
+
# x = spec(x) # (2, 110, 1024): B,T,F
|
180 |
+
x = torch.randn(2, 110, 1024) # (B, T, F)
|
181 |
+
pre_enc_res3b_hftt = PreEncoderBlockRes3BHFTT()
|
182 |
+
y = pre_enc_res3b_hftt(x) # (2, 110, 128, 128): B, T, F, C
|
183 |
+
|
184 |
+
|
185 |
+
# # ====================================================================================================================
|
186 |
+
# # PreEncoderBlockConv1D: Pre-encoder without activation, with Melspec input
|
187 |
+
# # ====================================================================================================================
|
188 |
+
# class PreEncoderBlockConv1D(nn.Module):
|
189 |
+
|
190 |
+
# def __init__(self,
|
191 |
+
# in_channels,
|
192 |
+
# out_channels,
|
193 |
+
# kernel_size=3) -> None:
|
194 |
+
# """Pre-Encoder with 1D convolution."""
|
195 |
+
# super().__init__()
|
196 |
+
# self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=1)
|
197 |
+
# self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size, stride=1)
|
198 |
+
|
199 |
+
# def forward(self, x: torch.Tensor) -> torch.Tensor:
|
200 |
+
# # x: (B, T, F) or (2, 128, 256), input melspec
|
201 |
+
# x = rearrange(x, 'b t f -> b f t') # (2, 256, 128): B,F,T
|
202 |
+
# x = self.conv1(x) # (2, 128, 128): B,F,T
|
203 |
+
# return rearrange(x, 'b f t -> b t f') # (2, 110, 128): B,T,F
|
204 |
+
|
205 |
+
# def test_conv1d():
|
206 |
+
# # from model.spectrogram import get_spectrogram_layer_from_audio_cfg
|
207 |
+
# # from config.config import audio_cfg as default_audio_cfg
|
208 |
+
# # audio_cfg = default_audio_cfg
|
209 |
+
# # audio_cfg['codec'] = 'melspec'
|
210 |
+
# # audio_cfg['hop_length'] = 256
|
211 |
+
# # audio_cfg['n_mels'] = 512
|
212 |
+
# # x = torch.randn(2, 1, 32767)
|
213 |
+
# # mspec, _ = get_spectrogram_layer_from_audio_cfg(audio_cfg)
|
214 |
+
# # x = mspec(x)
|
215 |
+
# x = torch.randn(2, 128, 128) # (B, T, F)
|
216 |
+
# pre_enc_conv1d = PreEncoderBlockConv1D(in_channels=1, out_channels=128)
|
217 |
+
# y = pre_enc_conv1d(x) # (2, 110, 128, 128): B, T, F, C
|
amt/src/model/ff_layer.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
"""ff_layer.py
|
11 |
+
|
12 |
+
This module contains the implementation of the feedforward layers.
|
13 |
+
|
14 |
+
Supported ff_layer_type:
|
15 |
+
'mlp': Multi-Layer Perceptron
|
16 |
+
'gmlp': Gated Multi-Layer Perceptron, simplified version of Mixtral Expert with num_experts=1 and top_k=1.
|
17 |
+
This is not the spatial gating MLP (https://arxiv.org/abs/2105.08050).
|
18 |
+
'moe': Mixtral of Experts, modified from the original source code:
|
19 |
+
https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mixtral/modeling_mixtral.py
|
20 |
+
|
21 |
+
Usage:
|
22 |
+
from model.ff_layer import get_ff_layer
|
23 |
+
|
24 |
+
config = PerceiverTFConfig() # or any type of PretrainedConfig()
|
25 |
+
config.ff_layer_type = 'moe' # or 'mlp'
|
26 |
+
config.moe_num_experts = 4
|
27 |
+
config.moe_topk = 2
|
28 |
+
config.hidden_act = 'gelu' # or any type of activation function, e.g., 'silu'
|
29 |
+
|
30 |
+
ff_layer = get_ff_layer(config, input_size, widening_factor)
|
31 |
+
|
32 |
+
What ff_layer returns:
|
33 |
+
- It returns (hidden_states, router_logits) for MoE and (hidden_states, None) for MLP.
|
34 |
+
- router_logits has the shape of (batch_size * sequence_length, n_experts) for MoE.
|
35 |
+
|
36 |
+
|
37 |
+
"""
|
38 |
+
from typing import Any, Tuple
|
39 |
+
import torch
|
40 |
+
import torch.nn as nn
|
41 |
+
import torch.nn.functional as F
|
42 |
+
from transformers.configuration_utils import PretrainedConfig
|
43 |
+
from transformers.activations import ACT2FN
|
44 |
+
from model.ops import get_layer_norm
|
45 |
+
from model.ops import optional_compiler_disable, optional_compiler_dynamic
|
46 |
+
|
47 |
+
|
48 |
+
class MixtralBlockSparseTop2MLP(nn.Module):
|
49 |
+
"""
|
50 |
+
The Gated Multilayer Perceptron (GMLP) used in Mixtral of Experts (MoE).
|
51 |
+
|
52 |
+
"""
|
53 |
+
|
54 |
+
def __init__(self, config: PretrainedConfig, input_size: int, widening_factor: int):
|
55 |
+
super().__init__()
|
56 |
+
self.hidden_dim = input_size
|
57 |
+
self.ffn_dim = int(input_size * widening_factor)
|
58 |
+
|
59 |
+
self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
60 |
+
self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
|
61 |
+
self.gate = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
62 |
+
self.act_fn = ACT2FN[config.hidden_act]
|
63 |
+
|
64 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
65 |
+
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.gate(hidden_states)
|
66 |
+
current_hidden_states = self.w2(current_hidden_states)
|
67 |
+
return current_hidden_states
|
68 |
+
|
69 |
+
|
70 |
+
class MixtralSparseMoeBlock(nn.Module):
|
71 |
+
"""
|
72 |
+
This implementation is
|
73 |
+
strictly equivalent to standard MoE with full capacity (no
|
74 |
+
dropped tokens). It's faster since it formulates MoE operations
|
75 |
+
in terms of block-sparse operations to accomodate imbalanced
|
76 |
+
assignments of tokens to experts, whereas standard MoE either
|
77 |
+
(1) drop tokens at the cost of reduced performance or (2) set
|
78 |
+
capacity factor to number of experts and thus waste computation
|
79 |
+
and memory on padding.
|
80 |
+
"""
|
81 |
+
|
82 |
+
def __init__(self, config, input_size: int, widening_factor: int):
|
83 |
+
super().__init__()
|
84 |
+
self.hidden_dim = input_size
|
85 |
+
self.widening_factor = widening_factor
|
86 |
+
self.num_experts = config.moe_num_experts
|
87 |
+
self.top_k = config.moe_topk
|
88 |
+
|
89 |
+
# gating
|
90 |
+
self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
|
91 |
+
self.experts = nn.ModuleList(
|
92 |
+
[MixtralBlockSparseTop2MLP(config, self.hidden_dim, self.widening_factor) for _ in range(self.num_experts)])
|
93 |
+
|
94 |
+
@optional_compiler_disable
|
95 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
96 |
+
""" """
|
97 |
+
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
98 |
+
hidden_states = hidden_states.view(-1, hidden_dim)
|
99 |
+
# router_logits: (batch * sequence_length, n_experts)
|
100 |
+
router_logits = self.gate(hidden_states)
|
101 |
+
|
102 |
+
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
|
103 |
+
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
|
104 |
+
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
105 |
+
# we cast back to the input dtype
|
106 |
+
routing_weights = routing_weights.to(hidden_states.dtype)
|
107 |
+
|
108 |
+
final_hidden_states = torch.zeros((batch_size * sequence_length, hidden_dim),
|
109 |
+
dtype=hidden_states.dtype,
|
110 |
+
device=hidden_states.device)
|
111 |
+
|
112 |
+
# One hot encode the selected experts to create an expert mask
|
113 |
+
# this will be used to easily index which expert is going to be sollicitated
|
114 |
+
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
|
115 |
+
|
116 |
+
# Loop over all available experts in the model and perform the computation on each expert
|
117 |
+
for expert_idx in range(self.num_experts):
|
118 |
+
expert_layer = self.experts[expert_idx]
|
119 |
+
idx, top_x = torch.where(expert_mask[expert_idx])
|
120 |
+
|
121 |
+
if top_x.shape[0] == 0:
|
122 |
+
continue
|
123 |
+
|
124 |
+
# in torch it is faster to index using lists than torch tensors
|
125 |
+
top_x_list = top_x.tolist()
|
126 |
+
idx_list = idx.tolist()
|
127 |
+
|
128 |
+
# Index the correct hidden states and compute the expert hidden state for
|
129 |
+
# the current expert. We need to make sure to multiply the output hidden
|
130 |
+
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
|
131 |
+
current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
|
132 |
+
current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
|
133 |
+
|
134 |
+
# However `index_add_` only support torch tensors for indexing so we'll use
|
135 |
+
# the `top_x` tensor here.
|
136 |
+
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
|
137 |
+
final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
|
138 |
+
return final_hidden_states, router_logits
|
139 |
+
|
140 |
+
|
141 |
+
class MLP(nn.Module):
|
142 |
+
"""A Standard Transformer-style dense module to follow attention."""
|
143 |
+
|
144 |
+
def __init__(self, config: PretrainedConfig, input_size: int, widening_factor: int):
|
145 |
+
super().__init__()
|
146 |
+
self.dense1 = nn.Linear(input_size, widening_factor * input_size)
|
147 |
+
self.dense2 = nn.Linear(widening_factor * input_size, input_size)
|
148 |
+
|
149 |
+
if isinstance(config.hidden_act, str):
|
150 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
151 |
+
else:
|
152 |
+
self.intermediate_act_fn = config.hidden_act
|
153 |
+
|
154 |
+
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, Any]:
|
155 |
+
hidden_states = self.dense1(hidden_states)
|
156 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
157 |
+
hidden_states = self.dense2(hidden_states)
|
158 |
+
return hidden_states, None
|
159 |
+
|
160 |
+
|
161 |
+
class SimpleGMLP(nn.Module):
|
162 |
+
"""A Simple Gated Multilayer Perceptron (aka. 'gmlp'), without the spatial gating mechanism.
|
163 |
+
|
164 |
+
Note that this is not the spatial gating MLP (https://arxiv.org/abs/2105.08050).
|
165 |
+
- A simplified MLP w/ gating mechanism adapted from Mixtral Expert, as when
|
166 |
+
the number of experts and top_k are both set to 1.)
|
167 |
+
- Added a dropout layer.
|
168 |
+
- This was also used in T5 v1.1.
|
169 |
+
"""
|
170 |
+
|
171 |
+
def __init__(self, config: PretrainedConfig, input_size: int, widening_factor: int):
|
172 |
+
super().__init__()
|
173 |
+
self.hidden_dim = input_size
|
174 |
+
self.ffn_dim = int(input_size * widening_factor)
|
175 |
+
|
176 |
+
self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
177 |
+
self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
|
178 |
+
self.gate = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
179 |
+
self.act_fn = ACT2FN[config.hidden_act]
|
180 |
+
self.dropout1 = nn.Dropout(config.dropout_rate)
|
181 |
+
self.dropout2 = nn.Dropout(config.dropout_rate)
|
182 |
+
|
183 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
184 |
+
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.gate(hidden_states)
|
185 |
+
current_hidden_states = self.dropout1(current_hidden_states)
|
186 |
+
current_hidden_states = self.w2(current_hidden_states)
|
187 |
+
current_hidden_states = self.dropout2(
|
188 |
+
current_hidden_states) # Residual connection is applied outside of this module.
|
189 |
+
return current_hidden_states, None
|
190 |
+
|
191 |
+
|
192 |
+
def get_ff_layer(config: PretrainedConfig, input_size: int, widening_factor: int):
|
193 |
+
if config.ff_layer_type == 'moe':
|
194 |
+
assert hasattr(config, 'moe_num_experts') and hasattr(config, 'moe_topk') and hasattr(config, 'hidden_act')
|
195 |
+
return MixtralSparseMoeBlock(config, input_size, widening_factor)
|
196 |
+
elif config.ff_layer_type == 'mlp':
|
197 |
+
assert hasattr(config, 'hidden_act')
|
198 |
+
return MLP(config, input_size, widening_factor)
|
199 |
+
elif config.ff_layer_type == 'gmlp':
|
200 |
+
assert hasattr(config, 'hidden_act')
|
201 |
+
return SimpleGMLP(config, input_size, widening_factor)
|
202 |
+
else:
|
203 |
+
raise ValueError(
|
204 |
+
f"Unsupported ff_layer_type: {config.ff_layer_type}. Supported types are 'moe', 'mlp' and 'gmlp'.")
|
205 |
+
|
206 |
+
|
207 |
+
def test_get_ff_layer():
|
208 |
+
from model.ff_layer import get_ff_layer
|
209 |
+
from model.perceiver_helper import PerceiverTFConfig
|
210 |
+
input_size = 32
|
211 |
+
widening_factor = 1
|
212 |
+
|
213 |
+
# Test for MoE
|
214 |
+
config = PerceiverTFConfig() # or any type of PretrainedConfig()
|
215 |
+
config.ff_layer_type = 'moe'
|
216 |
+
config.moe_num_experts = 4
|
217 |
+
config.moe_topk = 2
|
218 |
+
config.hidden_act = 'silu'
|
219 |
+
|
220 |
+
ff_layer = get_ff_layer(config, input_size, widening_factor)
|
221 |
+
x = torch.rand(2, 8, input_size)
|
222 |
+
hidden_states, router_logits = ff_layer(x)
|
223 |
+
print(hidden_states.shape, router_logits.shape) # (2, 8, 32), (2*8, 4)
|
224 |
+
|
225 |
+
# Test for MLP
|
226 |
+
config.ff_layer_type = 'mlp'
|
227 |
+
config.hidden_act = 'gelu'
|
228 |
+
|
229 |
+
ff_layer = get_ff_layer(config, input_size, widening_factor)
|
230 |
+
hidden_states, _ = ff_layer(x)
|
231 |
+
print(hidden_states.shape) # (2, 8, 32)
|
232 |
+
|
233 |
+
# Test for (simple)gMLP
|
234 |
+
config.ff_layer_type = 'gmlp'
|
235 |
+
config.hidden_act = 'silu'
|
236 |
+
ff_layer = get_ff_layer(config, input_size, widening_factor)
|
237 |
+
hidden_states, _ = ff_layer(x)
|
238 |
+
print(hidden_states.shape) # (2, 8, 32)
|
amt/src/model/init_train.py
ADDED
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
"""init_train.py"""
|
11 |
+
from typing import Tuple, Literal, Any
|
12 |
+
from copy import deepcopy
|
13 |
+
import os
|
14 |
+
import argparse
|
15 |
+
import pytorch_lightning as pl
|
16 |
+
from pytorch_lightning.loggers import WandbLogger
|
17 |
+
from pytorch_lightning.callbacks import ModelCheckpoint
|
18 |
+
from pytorch_lightning.callbacks import LearningRateMonitor
|
19 |
+
from pytorch_lightning.utilities import rank_zero_only
|
20 |
+
from config.config import shared_cfg as default_shared_cfg
|
21 |
+
from config.config import audio_cfg as default_audio_cfg
|
22 |
+
from config.config import model_cfg as default_model_cfg
|
23 |
+
from config.config import DEEPSPEED_CFG
|
24 |
+
|
25 |
+
|
26 |
+
def initialize_trainer(args: argparse.Namespace,
|
27 |
+
stage: Literal['train', 'test'] = 'train') -> Tuple[pl.Trainer, WandbLogger, dict]:
|
28 |
+
"""Initialize trainer and logger"""
|
29 |
+
shared_cfg = deepcopy(default_shared_cfg)
|
30 |
+
|
31 |
+
# create save dir
|
32 |
+
os.makedirs(shared_cfg["WANDB"]["save_dir"], exist_ok=True)
|
33 |
+
|
34 |
+
# collecting specific checkpoint from exp_id with extension (@xxx where xxx is checkpoint name)
|
35 |
+
if "@" in args.exp_id:
|
36 |
+
args.exp_id, checkpoint_name = args.exp_id.split("@")
|
37 |
+
else:
|
38 |
+
checkpoint_name = "last.ckpt"
|
39 |
+
|
40 |
+
# checkpoint dir
|
41 |
+
lightning_dir = os.path.join(shared_cfg["WANDB"]["save_dir"], args.project, args.exp_id)
|
42 |
+
|
43 |
+
# create logger
|
44 |
+
if args.wandb_mode is not None:
|
45 |
+
shared_cfg["WANDB"]["mode"] = str(args.wandb_mode)
|
46 |
+
if shared_cfg["WANDB"].get("cache_dir", None) is not None:
|
47 |
+
os.environ["WANDB_CACHE_DIR"] = shared_cfg["WANDB"].get("cache_dir")
|
48 |
+
del shared_cfg["WANDB"]["cache_dir"] # remove cache_dir from shared_cfg
|
49 |
+
wandb_logger = WandbLogger(log_model="all",
|
50 |
+
project=args.project,
|
51 |
+
id=args.exp_id,
|
52 |
+
allow_val_change=True,
|
53 |
+
**shared_cfg['WANDB'])
|
54 |
+
|
55 |
+
# check if any checkpoint exists
|
56 |
+
last_ckpt_path = os.path.join(lightning_dir, "checkpoints", checkpoint_name)
|
57 |
+
if os.path.exists(os.path.join(last_ckpt_path)):
|
58 |
+
print(f'Resuming from {last_ckpt_path}')
|
59 |
+
elif stage == 'train':
|
60 |
+
print(f'No checkpoint found in {last_ckpt_path}. Starting from scratch')
|
61 |
+
last_ckpt_path = None
|
62 |
+
else:
|
63 |
+
raise ValueError(f'No checkpoint found in {last_ckpt_path}. Quit...')
|
64 |
+
|
65 |
+
# add info
|
66 |
+
dir_info = dict(lightning_dir=lightning_dir, last_ckpt_path=last_ckpt_path)
|
67 |
+
|
68 |
+
# define checkpoint callback
|
69 |
+
checkpoint_callback = ModelCheckpoint(**shared_cfg["CHECKPOINT"],)
|
70 |
+
|
71 |
+
# define lr scheduler monitor callback
|
72 |
+
lr_monitor = LearningRateMonitor(logging_interval='step')
|
73 |
+
|
74 |
+
# deepspeed strategy
|
75 |
+
if args.strategy == 'deepspeed':
|
76 |
+
strategy = pl.strategies.DeepSpeedStrategy(config=DEEPSPEED_CFG)
|
77 |
+
|
78 |
+
# validation interval
|
79 |
+
if stage == 'train' and args.val_interval is not None:
|
80 |
+
shared_cfg["TRAINER"]["check_val_every_n_epoch"] = None
|
81 |
+
shared_cfg["TRAINER"]["val_check_interval"] = int(args.val_interval)
|
82 |
+
|
83 |
+
# define trainer
|
84 |
+
sync_batchnorm = False
|
85 |
+
if stage == 'train':
|
86 |
+
# train batch size
|
87 |
+
if args.train_batch_size is not None:
|
88 |
+
train_sub_bsz = int(args.train_batch_size[0])
|
89 |
+
train_local_bsz = int(args.train_batch_size[1])
|
90 |
+
if train_local_bsz % train_sub_bsz == 0:
|
91 |
+
shared_cfg["BSZ"]["train_sub"] = train_sub_bsz
|
92 |
+
shared_cfg["BSZ"]["train_local"] = train_local_bsz
|
93 |
+
else:
|
94 |
+
raise ValueError(
|
95 |
+
f'Local batch size {train_local_bsz} must be divisible by sub batch size {train_sub_bsz}')
|
96 |
+
|
97 |
+
# ddp strategy
|
98 |
+
if args.strategy == 'ddp':
|
99 |
+
args.strategy = 'ddp_find_unused_parameters_true' # fix for conformer or pitchshifter having unused parameter issue
|
100 |
+
|
101 |
+
# sync-batchnorm
|
102 |
+
if args.sync_batchnorm is True:
|
103 |
+
sync_batchnorm = True
|
104 |
+
|
105 |
+
train_params = dict(**shared_cfg["TRAINER"],
|
106 |
+
devices=args.num_gpus if args.num_gpus == 'auto' else int(args.num_gpus),
|
107 |
+
num_nodes=int(args.num_nodes),
|
108 |
+
strategy=strategy if args.strategy == 'deepspeed' else args.strategy,
|
109 |
+
precision=args.precision,
|
110 |
+
max_epochs=args.max_epochs if stage == 'train' else None,
|
111 |
+
max_steps=args.max_steps if stage == 'train' else -1,
|
112 |
+
logger=wandb_logger,
|
113 |
+
callbacks=[checkpoint_callback, lr_monitor],
|
114 |
+
sync_batchnorm=sync_batchnorm)
|
115 |
+
trainer = pl.trainer.trainer.Trainer(**train_params)
|
116 |
+
|
117 |
+
# Update wandb logger (for DDP)
|
118 |
+
if trainer.global_rank == 0:
|
119 |
+
wandb_logger.experiment.config.update(args, allow_val_change=True)
|
120 |
+
|
121 |
+
return trainer, wandb_logger, dir_info, shared_cfg
|
122 |
+
|
123 |
+
|
124 |
+
def update_config(args, shared_cfg, stage: Literal['train', 'test'] = 'train'):
|
125 |
+
"""Update audio/model/shared configurations with args"""
|
126 |
+
audio_cfg = default_audio_cfg
|
127 |
+
model_cfg = default_model_cfg
|
128 |
+
|
129 |
+
# Only update config when training
|
130 |
+
if stage == 'train':
|
131 |
+
# Augmentation parameters
|
132 |
+
if args.random_amp_range is not None:
|
133 |
+
shared_cfg["AUGMENTATION"]["train_random_amp_range"] = list(
|
134 |
+
(float(args.random_amp_range[0]), float(args.random_amp_range[1])))
|
135 |
+
if args.stem_iaug_prob is not None:
|
136 |
+
shared_cfg["AUGMENTATION"]["train_stem_iaug_prob"] = float(args.stem_iaug_prob)
|
137 |
+
|
138 |
+
if args.xaug_max_k is not None:
|
139 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["max_k"] = int(args.xaug_max_k)
|
140 |
+
if args.xaug_tau is not None:
|
141 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["tau"] = float(args.xaug_tau)
|
142 |
+
if args.xaug_alpha is not None:
|
143 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["alpha"] = float(args.xaug_alpha)
|
144 |
+
if args.xaug_no_instr_overlap is not None:
|
145 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["no_instr_overlap"] = bool(args.xaug_no_instr_overlap)
|
146 |
+
if args.xaug_no_drum_overlap is not None:
|
147 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["no_drum_overlap"] = bool(args.xaug_no_drum_overlap)
|
148 |
+
if args.uhat_intra_stem_augment is not None:
|
149 |
+
shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]["uhat_intra_stem_augment"] = bool(
|
150 |
+
args.uhat_intra_stem_augment)
|
151 |
+
|
152 |
+
if args.pitch_shift_range is not None:
|
153 |
+
if args.pitch_shift_range in [["0", "0"], [0, 0]]:
|
154 |
+
shared_cfg["AUGMENTATION"]["train_pitch_shift_range"] = None
|
155 |
+
else:
|
156 |
+
shared_cfg["AUGMENTATION"]["train_pitch_shift_range"] = list(
|
157 |
+
(int(args.pitch_shift_range[0]), int(args.pitch_shift_range[1])))
|
158 |
+
|
159 |
+
train_stem_iaug_prob = shared_cfg["AUGMENTATION"]["train_stem_iaug_prob"]
|
160 |
+
random_amp_range = shared_cfg["AUGMENTATION"]["train_random_amp_range"]
|
161 |
+
train_stem_xaug_policy = shared_cfg["AUGMENTATION"]["train_stem_xaug_policy"]
|
162 |
+
print(f'Random amp range: {random_amp_range}\n' +
|
163 |
+
f'Intra-stem augmentation probability: {train_stem_iaug_prob}\n' +
|
164 |
+
f'Stem augmentation policy: {train_stem_xaug_policy}\n' +
|
165 |
+
f'Pitch shift range: {shared_cfg["AUGMENTATION"]["train_pitch_shift_range"]}\n')
|
166 |
+
|
167 |
+
# Update audio config
|
168 |
+
if args.audio_codec != None:
|
169 |
+
assert args.audio_codec in ['spec', 'melspec']
|
170 |
+
audio_cfg["codec"] = str(args.audio_codec)
|
171 |
+
if args.hop_length != None:
|
172 |
+
audio_cfg["hop_length"] = int(args.hop_length)
|
173 |
+
if args.n_mels != None:
|
174 |
+
audio_cfg["n_mels"] = int(args.n_mels)
|
175 |
+
if args.input_frames != None:
|
176 |
+
audio_cfg["input_frames"] = int(args.input_frames)
|
177 |
+
|
178 |
+
# Update shared config
|
179 |
+
if shared_cfg["TOKENIZER"]["max_shift_steps"] == "auto":
|
180 |
+
shift_steps_ms = shared_cfg["TOKENIZER"]["shift_step_ms"]
|
181 |
+
input_frames = audio_cfg["input_frames"]
|
182 |
+
fs = audio_cfg["sample_rate"]
|
183 |
+
max_shift_steps = (input_frames / fs) // (shift_steps_ms / 1000) + 2 # 206 by default
|
184 |
+
shared_cfg["TOKENIZER"]["max_shift_steps"] = int(max_shift_steps)
|
185 |
+
|
186 |
+
# Update model config
|
187 |
+
if args.encoder_type != None:
|
188 |
+
model_cfg["encoder_type"] = str(args.encoder_type)
|
189 |
+
if args.decoder_type != None:
|
190 |
+
model_cfg["decoder_type"] = str(args.decoder_type)
|
191 |
+
if args.pre_encoder_type != "default":
|
192 |
+
model_cfg["pre_encoder_type"] = str(args.pre_encoder_type)
|
193 |
+
if args.pre_decoder_type != 'default':
|
194 |
+
model_cfg["pre_decoder_type"] = str(args.pre_decoder_type)
|
195 |
+
if args.conv_out_channels != None:
|
196 |
+
model_cfg["conv_out_channels"] = int(args.conv_out_channels)
|
197 |
+
assert isinstance(args.task_cond_decoder, bool) and isinstance(args.task_cond_encoder, bool)
|
198 |
+
model_cfg["use_task_conditional_encoder"] = args.task_cond_encoder
|
199 |
+
model_cfg["use_task_conditional_decoder"] = args.task_cond_decoder
|
200 |
+
|
201 |
+
if args.encoder_position_encoding_type != 'default':
|
202 |
+
if args.encoder_position_encoding_type in ['None', 'none', '0']:
|
203 |
+
model_cfg["encoder"][model_cfg["encoder_type"]]["position_encoding_type"] = None
|
204 |
+
elif args.encoder_position_encoding_type in [
|
205 |
+
'sinusoidal', 'rope', 'trainable', 'alibi', 'alibit', 'tkd', 'td', 'tk', 'kdt'
|
206 |
+
]:
|
207 |
+
model_cfg["encoder"][model_cfg["encoder_type"]]["position_encoding_type"] = str(
|
208 |
+
args.encoder_position_encoding_type)
|
209 |
+
else:
|
210 |
+
raise ValueError(f'Encoder PE type {args.encoder_position_encoding_type} not supported')
|
211 |
+
if args.decoder_position_encoding_type != 'default':
|
212 |
+
if args.decoder_position_encoding_type in ['None', 'none', '0']:
|
213 |
+
raise ValueError('Decoder PE type cannot be None')
|
214 |
+
elif args.decoder_position_encoding_type in ['sinusoidal', 'trainable']:
|
215 |
+
model_cfg["decoder"][model_cfg["decoder_type"]]["position_encoding_type"] = str(
|
216 |
+
args.decoder_position_encoding_type)
|
217 |
+
else:
|
218 |
+
raise ValueError(f'Decoder PE {args.decoder_position_encoding_type} not supported')
|
219 |
+
|
220 |
+
if args.tie_word_embedding is not None:
|
221 |
+
model_cfg["tie_word_embedding"] = bool(args.tie_word_embedding)
|
222 |
+
|
223 |
+
if args.d_feat != None:
|
224 |
+
model_cfg["d_feat"] = int(args.d_feat)
|
225 |
+
if args.d_latent != None:
|
226 |
+
model_cfg['encoder']['perceiver-tf']["d_latent"] = int(args.d_latent)
|
227 |
+
if args.num_latents != None:
|
228 |
+
model_cfg['encoder']['perceiver-tf']['num_latents'] = int(args.num_latents)
|
229 |
+
if args.perceiver_tf_d_model != None:
|
230 |
+
model_cfg['encoder']['perceiver-tf']['d_model'] = int(args.perceiver_tf_d_model)
|
231 |
+
if args.num_perceiver_tf_blocks != None:
|
232 |
+
model_cfg["encoder"]["perceiver-tf"]["num_blocks"] = int(args.num_perceiver_tf_blocks)
|
233 |
+
if args.num_perceiver_tf_local_transformers_per_block != None:
|
234 |
+
model_cfg["encoder"]["perceiver-tf"]["num_local_transformers_per_block"] = int(
|
235 |
+
args.num_perceiver_tf_local_transformers_per_block)
|
236 |
+
if args.num_perceiver_tf_temporal_transformers_per_block != None:
|
237 |
+
model_cfg["encoder"]["perceiver-tf"]["num_temporal_transformers_per_block"] = int(
|
238 |
+
args.num_perceiver_tf_temporal_transformers_per_block)
|
239 |
+
if args.attention_to_channel != None:
|
240 |
+
model_cfg["encoder"]["perceiver-tf"]["attention_to_channel"] = bool(args.attention_to_channel)
|
241 |
+
if args.sca_use_query_residual != None:
|
242 |
+
model_cfg["encoder"]["perceiver-tf"]["sca_use_query_residual"] = bool(args.sca_use_query_residual)
|
243 |
+
if args.layer_norm_type != None:
|
244 |
+
model_cfg["encoder"]["perceiver-tf"]["layer_norm"] = str(args.layer_norm_type)
|
245 |
+
if args.ff_layer_type != None:
|
246 |
+
model_cfg["encoder"]["perceiver-tf"]["ff_layer_type"] = str(args.ff_layer_type)
|
247 |
+
if args.ff_widening_factor != None:
|
248 |
+
model_cfg["encoder"]["perceiver-tf"]["ff_widening_factor"] = int(args.ff_widening_factor)
|
249 |
+
if args.moe_num_experts != None:
|
250 |
+
model_cfg["encoder"]["perceiver-tf"]["moe_num_experts"] = int(args.moe_num_experts)
|
251 |
+
if args.moe_topk != None:
|
252 |
+
model_cfg["encoder"]["perceiver-tf"]["moe_topk"] = int(args.moe_topk)
|
253 |
+
if args.hidden_act != None:
|
254 |
+
model_cfg["encoder"]["perceiver-tf"]["hidden_act"] = str(args.hidden_act)
|
255 |
+
if args.rotary_type != None:
|
256 |
+
assert len(
|
257 |
+
args.rotary_type
|
258 |
+
) == 3, "rotary_type must be a 3-letter string (e.g. 'ppl': 'pixel' for SCA, 'pixel' for latent, 'lang' for temporal transformer)"
|
259 |
+
model_cfg["encoder"]["perceiver-tf"]["rotary_type_sca"] = str(args.rotary_type)[0]
|
260 |
+
model_cfg["encoder"]["perceiver-tf"]["rotary_type_latent"] = str(args.rotary_type)[1]
|
261 |
+
model_cfg["encoder"]["perceiver-tf"]["rotary_type_temporal"] = str(args.rotary_type)[2]
|
262 |
+
if args.rope_apply_to_keys != None:
|
263 |
+
model_cfg["encoder"]["perceiver-tf"]["rope_apply_to_keys"] = bool(args.rope_apply_to_keys)
|
264 |
+
if args.rope_partial_pe != None:
|
265 |
+
model_cfg["encoder"]["perceiver-tf"]["rope_partial_pe"] = bool(args.rope_partial_pe)
|
266 |
+
|
267 |
+
if args.decoder_ff_layer_type != None:
|
268 |
+
model_cfg["decoder"][model_cfg["decoder_type"]]["ff_layer_type"] = str(args.decoder_ff_layer_type)
|
269 |
+
if args.decoder_ff_widening_factor != None:
|
270 |
+
model_cfg["decoder"][model_cfg["decoder_type"]]["ff_widening_factor"] = int(args.decoder_ff_widening_factor)
|
271 |
+
|
272 |
+
if args.event_length != None:
|
273 |
+
model_cfg["event_length"] = int(args.event_length)
|
274 |
+
|
275 |
+
if stage == 'train':
|
276 |
+
if args.encoder_dropout_rate != None:
|
277 |
+
model_cfg["encoder"][model_cfg["encoder_type"]]["dropout_rate"] = float(args.encoder_dropout_rate)
|
278 |
+
if args.decoder_dropout_rate != None:
|
279 |
+
model_cfg["decoder"][model_cfg["decoder_type"]]["dropout_rate"] = float(args.decoder_dropout_rate)
|
280 |
+
|
281 |
+
return shared_cfg, audio_cfg, model_cfg # return updated configs
|
amt/src/model/lm_head.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
"""lm_head.py"""
|
11 |
+
import torch
|
12 |
+
from torch import nn
|
13 |
+
from typing import Optional, Dict
|
14 |
+
|
15 |
+
|
16 |
+
class LMHead(nn.Module):
|
17 |
+
"""Language Model Head with tied weights."""
|
18 |
+
|
19 |
+
def __init__(self, decoder_config: Dict, init_factor: float = 1.0, tie_word_embeddings: bool = True):
|
20 |
+
|
21 |
+
super().__init__()
|
22 |
+
self.d_model = decoder_config["d_model"]
|
23 |
+
self.init_factor = init_factor
|
24 |
+
self.tie_word_embeddings = tie_word_embeddings
|
25 |
+
|
26 |
+
self.lm_head = nn.Linear(decoder_config["d_model"], decoder_config["vocab_size"], bias=False)
|
27 |
+
self._init_weights()
|
28 |
+
|
29 |
+
def _init_weights(self):
|
30 |
+
if self.tie_word_embeddings is False:
|
31 |
+
self.lm_head.weight.data.normal_(mean=0.0, std=self.init_factor * 1.0)
|
32 |
+
|
33 |
+
def forward(self, decoder_hs: torch.FloatTensor) -> torch.FloatTensor:
|
34 |
+
if self.tie_word_embeddings is True:
|
35 |
+
# Rescale output before projecting on vocab
|
36 |
+
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
|
37 |
+
decoder_hs = decoder_hs * (self.d_model**-0.5)
|
38 |
+
|
39 |
+
lm_logits = self.lm_head(decoder_hs)
|
40 |
+
return lm_logits
|
amt/src/model/lr_scheduler.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The YourMT3 Authors.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Please see the details in the LICENSE file.
|
10 |
+
"""lr_schedule.py"""
|
11 |
+
import torch
|
12 |
+
from typing import Dict, Optional
|
13 |
+
|
14 |
+
|
15 |
+
def get_lr_scheduler(optimizer: torch.optim.Optimizer, scheduler_name: str, base_lr: float, scheduler_cfg: Dict):
|
16 |
+
|
17 |
+
if scheduler_name.lower() == 'cosine':
|
18 |
+
from torch.optim.lr_scheduler import (
|
19 |
+
SequentialLR,
|
20 |
+
LinearLR,
|
21 |
+
CosineAnnealingLR,
|
22 |
+
)
|
23 |
+
|
24 |
+
scheduler1 = LinearLR(
|
25 |
+
optimizer,
|
26 |
+
start_factor=0.5,
|
27 |
+
end_factor=1,
|
28 |
+
total_iters=scheduler_cfg["warmup_steps"],
|
29 |
+
last_epoch=-1,
|
30 |
+
)
|
31 |
+
|
32 |
+
scheduler2 = CosineAnnealingLR(
|
33 |
+
optimizer,
|
34 |
+
T_max=scheduler_cfg["total_steps"] - scheduler_cfg["warmup_steps"],
|
35 |
+
eta_min=scheduler_cfg["final_cosine"],
|
36 |
+
)
|
37 |
+
|
38 |
+
lr_scheduler = SequentialLR(optimizer,
|
39 |
+
schedulers=[scheduler1, scheduler2],
|
40 |
+
milestones=[scheduler_cfg["warmup_steps"]])
|
41 |
+
elif scheduler_name.lower() == 'legacy':
|
42 |
+
import math
|
43 |
+
from torch.optim.lr_scheduler import (
|
44 |
+
SequentialLR,
|
45 |
+
LinearLR,
|
46 |
+
LambdaLR,
|
47 |
+
)
|
48 |
+
|
49 |
+
msg = "You are using T5 legacy LR Schedule, it's independent from the optim.base_lr"
|
50 |
+
print(msg)
|
51 |
+
|
52 |
+
num_steps_optimizer1 = math.ceil(scheduler_cfg["total_steps"] * 0.9)
|
53 |
+
iters_left_for_optimizer2 = scheduler_cfg["total_steps"] - num_steps_optimizer1
|
54 |
+
|
55 |
+
scheduler1 = LambdaLR(optimizer, lambda step: min(base_lr, 1.0 / math.sqrt(step)) / base_lr
|
56 |
+
if step else base_lr / base_lr)
|
57 |
+
|
58 |
+
scheduler2 = LinearLR(optimizer,
|
59 |
+
start_factor=(min(base_lr, 1.0 / math.sqrt(num_steps_optimizer1)) / base_lr),
|
60 |
+
end_factor=0,
|
61 |
+
total_iters=iters_left_for_optimizer2,
|
62 |
+
last_epoch=-1)
|
63 |
+
|
64 |
+
lr_scheduler = SequentialLR(
|
65 |
+
optimizer,
|
66 |
+
schedulers=[scheduler1, scheduler2],
|
67 |
+
milestones=[num_steps_optimizer1],
|
68 |
+
)
|
69 |
+
elif scheduler_name.lower() == 'constant':
|
70 |
+
from transformers import get_scheduler
|
71 |
+
lr_scheduler = get_scheduler(
|
72 |
+
name=scheduler_name.lower(),
|
73 |
+
optimizer=optimizer,
|
74 |
+
)
|
75 |
+
else:
|
76 |
+
raise NotImplementedError
|
77 |
+
|
78 |
+
return lr_scheduler
|
79 |
+
|
80 |
+
|
81 |
+
def extra_stats(args, model, optimizer):
|
82 |
+
stats = {}
|
83 |
+
|
84 |
+
if args.logging.weights_l2:
|
85 |
+
weights_l2 = sum(p.detach().norm(2).item()**2 for p in model.parameters())**0.5
|
86 |
+
stats['weights_l2'] = weights_l2
|
87 |
+
|
88 |
+
cur_lr = optimizer.param_groups[0]['lr']
|
89 |
+
stats['lr'] = cur_lr
|
90 |
+
|
91 |
+
return stats
|
amt/src/model/ops.py
ADDED
@@ -0,0 +1,111 @@
# Copyright 2024 The YourMT3 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Please see the details in the LICENSE file.
""" ops.py """
import math
from packaging.version import parse as VersionParse

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers.models.t5.modeling_t5 import T5LayerNorm as RMSNorm


def get_layer_norm(dim: int, layer_norm_type: str = "layer_norm", layer_norm_eps: float = 1e-5):
    """Get layer normalization layer.
    Args:
        dim (int): Feature dimension
        layer_norm_type (str): "layer_norm" or "rms_norm"
        layer_norm_eps (float): Epsilon value for numerical stability

    Returns:
        nn.Module: Layer normalization layer
    """
    if layer_norm_type == "rms_norm":
        # T5LayerNorm is equivalent to RMSNorm. https://arxiv.org/abs/1910.07467
        return RMSNorm(hidden_size=dim, eps=layer_norm_eps)
    else:
        return nn.LayerNorm(normalized_shape=dim, eps=layer_norm_eps)


def check_all_elements_equal(x: torch.Tensor) -> bool:
    return x.eq(x[0]).all().item()


def minmax_normalize(x: torch.Tensor, eps: float = 0.008) -> torch.FloatTensor:
    """Min-max normalization:

        x_norm = (x - x_min) / (x_max - x_min + eps)

    Args:
        x (torch.Tensor): (B, T, F)
    Returns:
        torch.Tensor: (B, T, F) with output range of [0, 1]
    """
    x_max = rearrange(x, "b t f -> b (t f)").max(1, keepdim=True)[0]
    x_min = rearrange(x, "b t f -> b (f t)").min(1, keepdim=True)[0]
    x_max = x_max[:, None, :]  # (B,1,1)
    x_min = x_min[:, None, :]  # (B,1,1)
    return (x - x_min) / (x_max - x_min + eps)


def count_parameters(model):
    num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    num_params = sum(p.numel() for p in model.parameters())
    return num_trainable_params, num_params


def adjust_b_to_gcd(a, b, min_gcd=16):
    """
    Adjust the value of b to ensure the GCD(a, b) is at least min_gcd with minimum change to b.

    Parameters:
    - a (int): A positive integer
    - b (int): A positive integer
    - min_gcd (int): The minimum desired GCD

    Returns:
    - int: The adjusted value of b
    """
    current_gcd = math.gcd(a, b)

    # If current GCD is already greater than or equal to min_gcd, return b as it is.
    if current_gcd >= min_gcd:
        return b

    # If a is less than min_gcd, then it's impossible to get a GCD of at least min_gcd.
    if a < min_gcd:
        raise ValueError("a must be at least as large as min_gcd.")

    # Adjust b by trying increments and decrements, preferring the smallest absolute change.
    adjusted_b_up = b
    adjusted_b_down = b

    while True:
        adjusted_b_up += 1
        adjusted_b_down -= 1

        if math.gcd(a, adjusted_b_up) >= min_gcd:
            return adjusted_b_up
        elif math.gcd(a, adjusted_b_down) >= min_gcd:
            return adjusted_b_down


def optional_compiler_disable(func):
    if VersionParse(torch.__version__) >= VersionParse("2.1"):
        # If the version is 2.1 or higher, apply the torch.compiler.disable decorator.
        return torch.compiler.disable(func)
    else:
        # If the version is below 2.1, return the original function.
        return func


def optional_compiler_dynamic(func):
    return torch.compile(func, dynamic=True)
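A short, self-contained sketch of how two of the helpers above behave; the shapes, numbers, and the `model.ops` import path are illustrative assumptions.

import torch
from model.ops import minmax_normalize, adjust_b_to_gcd  # assumed import path

x = torch.randn(2, 10, 128)       # (B, T, F) spectrogram-like batch
x_norm = minmax_normalize(x)      # per-sample min-max scaling into [0, 1]
assert x_norm.shape == x.shape

# Nudge b=100 to the closest value whose GCD with a=512 is at least 16.
print(adjust_b_to_gcd(a=512, b=100, min_gcd=16))  # -> 96 (gcd(512, 96) = 32)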
amt/src/model/optimizers.py
ADDED
@@ -0,0 +1,218 @@
""" optimizers.py

Code based on nanoT5 project:
https://github.com/PiotrNawrot/nanoT5/blob/main/nanoT5/utils/copied_utils.py

+ D-adapt Adam from https://github.com/facebookresearch/dadaptation
"""
import importlib
import math
import torch

from typing import Iterable, Tuple
from torch import nn
from torch.optim import Optimizer
from transformers import Adafactor
from torch.optim import AdamW


class AdamWScale(Optimizer):
    """
    This AdamW implementation is copied from Huggingface.
    We modified it with Adagrad scaling by rms of a weight tensor

    Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Parameters:
        params (`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (`float`, *optional*, defaults to 1e-3):
            The learning rate to use.
        betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)):
            Adam's betas parameters (b1, b2).
        eps (`float`, *optional*, defaults to 1e-6):
            Adam's epsilon for numerical stability.
        weight_decay (`float`, *optional*, defaults to 0):
            Decoupled weight decay to apply.
        correct_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
        no_deprecation_warning (`bool`, *optional*, defaults to `False`):
            A flag used to disable the deprecation warning (set to `True` to disable the warning).
    """

    def __init__(
        self,
        params: Iterable[nn.parameter.Parameter],
        lr: float = 1e-3,
        betas: Tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-6,
        weight_decay: float = 0.0,
        correct_bias: bool = True,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
        super().__init__(params, defaults)

    @staticmethod
    def _rms(tensor):
        return tensor.norm(2) / (tensor.numel()**0.5)

    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        "Adam does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]
                beta1, beta2 = group["betas"]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                state["step"] += 1

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
                denom = exp_avg_sq.sqrt().add_(group["eps"])

                step_size = group["lr"]
                if group["correct_bias"]:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1**state["step"]
                    bias_correction2 = 1.0 - beta2**state["step"]
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

                # /Adapt Step from Adagrad
                step_size = step_size * max(1e-3, self._rms(p.data))
                # /Adapt Step from Adagrad

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                # Add weight decay at the end (fixed version)
                if group["weight_decay"] > 0.0:
                    p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"]))

        return loss


# def get_optimizer(models_dict: nn.ModuleDict,
#                   optimizer_name: str,
#                   base_lr: float,
#                   weight_decay: float = 0.):

#     no_decay = [
#         "bias", "LayerNorm", "layernorm", "layer_norm", "ln", "BatchNorm", "bn", "batch_norm",
#         "batchnorm"
#     ]

#     optimizer_grouped_parameters = []
#     for name, current_model in models_dict.items():
#         if current_model is None:
#             continue
#         optimizer_grouped_parameters += [
#             {
#                 "params": [
#                     p for n, p in current_model.named_parameters()
#                     if not any(nd in n for nd in no_decay)
#                 ],
#                 "weight_decay": weight_decay,
#             },
#             {
#                 "params": [
#                     p for n, p in current_model.named_parameters()
#                     if any(nd in n for nd in no_decay)
#                 ],
#                 "weight_decay": 0.0,
#             },
#         ]
def get_optimizer(models_dict: nn.ModuleDict,
                  optimizer_name: str,
                  base_lr: float,
                  weight_decay: float = 0.):

    no_decay = [
        "bias", "LayerNorm", "layernorm", "layer_norm", "ln", "BatchNorm", "bn", "batch_norm",
        "batchnorm"
    ]
    optimizer_grouped_parameters = []
    # NOTE: this loop expects an iterable of (name, parameter) pairs,
    # e.g. model.named_parameters(), despite the nn.ModuleDict type hint.
    for n, p in models_dict:
        # drop pitch shifter
        if 'pshifters' in n:
            continue
        # no weight decay for bias and normalization parameters (substring match on the name,
        # as in the commented-out reference implementation above)
        if any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({"params": [p], "weight_decay": 0.0})
        else:
            optimizer_grouped_parameters.append({"params": [p], "weight_decay": weight_decay})

    if optimizer_name.lower() == 'adamw':
        base_lr = 1e-03 if base_lr is None else float(base_lr)
        opt = AdamW(optimizer_grouped_parameters, lr=base_lr)
    elif optimizer_name.lower() == 'adafactor':
        if base_lr is None:
            opt = Adafactor(
                optimizer_grouped_parameters,
                lr=None,
                scale_parameter=True,
                relative_step=True,
                warmup_init=True)
        else:
            opt = Adafactor(optimizer_grouped_parameters, lr=base_lr, relative_step=False)
    elif optimizer_name.lower() == 'adamwscale':
        base_lr = 1e-02 if base_lr is None else float(base_lr)
        opt = AdamWScale(
            optimizer_grouped_parameters,
            lr=base_lr,
        )
    elif optimizer_name.lower() == 'cpuadam':
        dspd = importlib.import_module('deepspeed')
        base_lr = 1e-03 if base_lr is None else float(base_lr)
        opt = dspd.ops.adam.cpu_adam.DeepSpeedCPUAdam(optimizer_grouped_parameters, lr=base_lr)
    elif optimizer_name.lower() == 'dadaptadam':
        dadaptation = importlib.import_module('dadaptation')
        base_lr = 1.0 if base_lr is None else float(base_lr)
        opt = dadaptation.DAdaptAdam(optimizer_grouped_parameters, lr=base_lr)
    else:
        raise NotImplementedError(optimizer_name)

    return opt, base_lr
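A minimal sketch of the RMS-scaled AdamW variant and the optimizer factory on a toy model; the model, values, and the `model.optimizers` import path are illustrative assumptions, and `get_optimizer` is fed (name, parameter) pairs as its loop expects.

import torch
from torch import nn
from model.optimizers import AdamWScale, get_optimizer  # assumed import path

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))

# Direct use of AdamWScale.
opt = AdamWScale(model.parameters(), lr=1e-2)
model(torch.randn(4, 16)).sum().backward()
opt.step()

# Through the factory; base_lr=None falls back to the per-optimizer default (1e-2 for 'adamwscale').
opt2, lr = get_optimizer(model.named_parameters(), optimizer_name='adamwscale', base_lr=None)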
amt/src/model/perceiver_helper.py
ADDED
@@ -0,0 +1,290 @@
# Copyright 2024 The YourMT3 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Please see the details in the LICENSE file.
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import nn
from transformers.utils import ModelOutput
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
# from transformers.models.perceiver.modeling_perceiver import (PerceiverAbstractPositionEncoding,
#                                                                PerceiverTrainablePositionEncoding,
#                                                                PerceiverFourierPositionEncoding)


class PerceiverTFConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PerceiverTF`]. It is used to instantiate an
    Perceiver model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Perceiver
    [deepmind/language-perceiver](https://huggingface.co/deepmind/language-perceiver) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_latents (`int`, *optional*, defaults to 256):
            The number of latents.
        d_latents (`int`, *optional*, defaults to 1280):
            Dimension of the latent embeddings.
        d_model (`int`, *optional*, defaults to 768):
            Dimension of the inputs. Should only be provided in case [*PerceiverTextPreprocessor*] is used or no
            preprocessor is provided.
        kv_dim (`int`, *optional*, defaults to 128):
        num_blocks (`int`, *optional*, defaults to 1):
            Number of blocks in the Transformer encoder.
        num_self_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each self-attention layer in the Transformer encoder.
        num_cross_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each cross-attention layer in the Transformer encoder.
        num_local_transformers_per_block (`int`, *optional*, defaults to 2):
            Number of local Transformer layers per Transformer block in the Transformer encoder.
        num_temporal_transformers_per_block (`int`, *optional*, defaults to 2):
            Number of temporal Transformer layers per Transformer block in the Transformer encoder.
        shared_parallel_temporal_transformers (`bool`, *optional*, defaults to `False`):
            Whether to share the parameters across the K parallel temporal Transformers in each block.
        qk_channels (`int`, *optional*):
            Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
            layers of the encoder. Will default to preserving the dimension of the queries if not specified.
        v_channels (`int`, *optional*):
            Dimension to project the values before applying attention in the cross-attention and self-attention layers
            of the encoder. Will default to preserving the dimension of the queries if not specified.
        ** DEPRECATED ** cross_attention_shape_for_attention (`str`, *optional*, defaults to `'kv'`):
            Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
        ** DEPRECATED ** self_attention_widening_factor (`int`, *optional*, defaults to 1):
            Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
        cross_attention_widening_factor (`int`, *optional*, defaults to 1):
            Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_type (`str`, *optional*, defaults to `'layer_norm'`):
            The type of layer normalization to use. Can be one of {'layer_norm', 'rms_norm'}.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        sca_use_query_residual (`bool`, *optional*, defaults to `True`):
            Whether to add a query residual in the spectral cross attention (SCA) layer of the encoder.
        use_query_residual (`float`, *optional*, defaults to `True`):
            Whether to add a query residual in the cross-attention layer of the encoder.
        position_encoding_type (`str`, *optional*, defaults to `'trainable'`):
            Type of position encoding to use. Can be one of {'trainable', 'alibi', 'alibit', 'rope', None}.
        num_max_positions (`int`, *optional*, defaults to 331):
            Maximum number of positions to use for the position encoding.
        vocab_size (`int`, *optional*, defaults to 262):
            Vocabulary size for the masked language modeling model.
        attention_to_channel (`bool`, defaults to `False`):
            Whether SCA should attend to the channel dimension. If False, attention to frequency bin dimension.
        ff_layer_type (`str`, *optional*, defaults to `'mlp'`):
            Type of feed-forward layer to use. Can be one of {'mlp', 'moe'}.
        ff_widening_factor (`int`, *optional*, defaults to 1):
            Widening factor for the feed-forward layers in the MLP/MoE.
        moe_num_experts (`int`, *optional*, defaults to 4):
            Number of experts to use in the mixture of experts (MoE) feed-forward layer.
            Only used if `ff_layer_type` is set to `'moe'`.
        moe_topk (`int`, *optional*, defaults to 2):
            Number of top experts to use in the mixture of experts (MoE) feed-forward layer.
            Only used if `ff_layer_type` is set to `'moe'`.
        rope_type_sca (`str`, *optional*, defaults to `pixel`): Can be one of {'l'|'lang', 'p'|'pixel', None}.
            RoPE index type for SCA. Only used if `position_encoding_type` is set to `'rope'`.
        rope_type_latent (`str`, *optional*, defaults to `pixel`): Can be one of {'l'|'lang', 'p'|'pixel', None}.
            RoPE index type for Latent Transformer. Only used if `position_encoding_type` is set to `'rope'`.
        rope_type_temporal (`str`, *optional*, defaults to `lang`): Can be one of {'l'|'lang', 'p'|'pixel', None}.
            RoPE index type for Temporal Transformer. Only used if `position_encoding_type` is set to `'rope'`.
        rope_apply_to_keys (`bool`, *optional*, defaults to `False`): Whether to apply RoPE to the keys in the
            self/cross-attention layers. Only used if `position_encoding_type` is set to `'rope'`.
        rope_partial_pe (`bool`, *optional*, defaults to `False`): Whether to use partial RoPE in the self/cross-attention.
            Only used if `position_encoding_type` is set to `'rope'`.
        rope_trainable (`bool`, *optional*, defaults to `False`): Whether to make the RoPE trainable. Only used if
            `position_encoding_type` is set to `'rope'`.

    Example:

    ```python
    >>> from model.perceiver_mod import PerceiverTFEncoder, PerceiverTFConfig

    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
    >>> configuration = PerceiverTFConfig()

    >>> # Initializing a model from the deepmind/language-perceiver style configuration
    >>> model = PerceiverTFEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "perceivertf"

    def __init__(
        self,
        num_latents=24,
        d_latents=128,
        d_model=128,
        kv_dim=128,
        num_blocks=3,
        num_self_attention_heads=8,
        num_cross_attention_heads=8,
        num_local_transformers_per_block=2,
        num_temporal_transformers_per_block=2,
        qk_channels=128,
        v_channels=128,
        cross_attention_shape_for_attention="q",
        # self_attention_widening_factor=1, ** DEPRECATED **
        # cross_attention_widening_factor=1, ** DEPRECATED **
        hidden_act="gelu",
        dropout_rate=0.1,
        initializer_range=0.02,
        layer_norm_type="layer_norm",
        layer_norm_eps=1e-5,
        sca_use_query_residual=True,
        use_query_residual=True,
        position_encoding_type="trainable",
        num_max_positions=330,
        vocab_size=1391,
        attention_to_channel=False,
        ff_layer_type="mlp",
        ff_widening_factor=1,
        moe_num_experts=4,
        moe_topk=2,
        rope_type_sca="pixel",
        rope_type_latent="pixel",
        rope_type_temporal="lang",
        rope_apply_to_keys=False,
        rope_partial_pe=False,
        rope_trainable=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_latents = num_latents
        self.d_latents = d_latents
        self.d_model = d_model
        self.kv_dim = kv_dim
        self.qk_channels = qk_channels
        self.v_channels = v_channels

        self.num_blocks = num_blocks
        self.num_self_attention_heads = num_self_attention_heads
        self.num_cross_attention_heads = num_cross_attention_heads
        self.num_local_transformers_per_block = num_local_transformers_per_block
        self.num_temporal_transformers_per_block = num_temporal_transformers_per_block
        self.sca_use_query_residual = sca_use_query_residual
        self.use_query_residual = use_query_residual
        self.position_encoding_type = position_encoding_type
        self.num_max_positions = num_max_positions
        # self.self_attention_widening_factor = self_attention_widening_factor
        # self.cross_attention_widening_factor = cross_attention_widening_factor
        self.cross_attention_shape_for_attention = cross_attention_shape_for_attention
        self.attention_to_channel = attention_to_channel
        self.ff_layer_type = ff_layer_type
        self.ff_widening_factor = ff_widening_factor
        self.moe_num_experts = moe_num_experts
        self.moe_topk = moe_topk
        self.rope_type_sca = rope_type_sca
        self.rope_type_latent = rope_type_latent
        self.rope_type_temporal = rope_type_temporal
        self.rope_apply_to_keys = rope_apply_to_keys
        self.rope_partial_pe = rope_partial_pe
        self.rope_trainable = rope_trainable

        self.hidden_act = hidden_act
        self.dropout_rate = dropout_rate
        self.initializer_range = initializer_range
        self.layer_norm_type = layer_norm_type
        self.layer_norm_eps = layer_norm_eps

        # masked language modeling attributes
        self.vocab_size = vocab_size


class PerceiverTFPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = PerceiverTFConfig
    base_model_prefix = "perceivertf"
    main_input_name = "inputs"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif hasattr(module, "latents"):
            module.latents.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif hasattr(module, "_pos_emb") and isinstance(module._pos_emb, nn.Parameter):
            # initialize PerceiverTFTrainablePE
            module._pos_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif hasattr(module, "_pos_emb_temporal"):
            # initialize PerceiverTFTrainablePE
            module._pos_emb_temporal.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif hasattr(module, "slopes") and isinstance(module.slopes, nn.Parameter):
            # initialize AlibiPositionalBias
            module.reset_parameters()
        elif isinstance(module, nn.ParameterDict):
            for modality in module.keys():
                module[modality].data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        # elif hasattr(module, "position_embeddings") and isinstance(
        #         module, PerceiverTrainablePositionEncoding):
        #     module.position_embeddings.data.normal_(mean=0.0, std=self.config.initializer_range)


# Replace the 'ModelOutputWithCrossAttentions' with 'MoEModelOutputWithCrossAttentions' for MoE
@dataclass
class MoEModelOutputWithCrossAttentions(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.
    Plus, router_probs for Mixture of Experts models.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.

            Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary
            loss and the z_loss for Mixture of Experts models.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    router_logits: Optional[Tuple[torch.FloatTensor]] = None
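A small sketch of working with the config and output classes above; the overridden values and the `model.perceiver_helper` import path are illustrative assumptions.

import torch
from model.perceiver_helper import PerceiverTFConfig, MoEModelOutputWithCrossAttentions  # assumed import path

cfg = PerceiverTFConfig(num_latents=24, d_latents=128, ff_layer_type="moe", moe_num_experts=4)
print(cfg.model_type)                               # "perceivertf"
cfg2 = PerceiverTFConfig.from_dict(cfg.to_dict())   # round-trip via PretrainedConfig serialization

out = MoEModelOutputWithCrossAttentions(last_hidden_state=torch.zeros(2, 10, cfg.d_latents))
print(out.last_hidden_state.shape)                  # ModelOutput gives dataclass-style field access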