pere committed on Nov 7, 2024

Commit

864d4c5

verified ·

1 Parent(s): 97e3033

Saving best state, step 500, val wer 96.036

Browse files

Files changed (47) hide show

.gitignore +1 -0
added_tokens.json +1611 -0
checkpoint-500-epoch-0-val-wer-96.036/added_tokens.json +1611 -0
checkpoint-500-epoch-0-val-wer-96.036/config.json +285 -0
checkpoint-500-epoch-0-val-wer-96.036/generation_config.json +256 -0
checkpoint-500-epoch-0-val-wer-96.036/merges.txt +0 -0
checkpoint-500-epoch-0-val-wer-96.036/model.safetensors +3 -0
checkpoint-500-epoch-0-val-wer-96.036/model_1.safetensors +3 -0
checkpoint-500-epoch-0-val-wer-96.036/optimizer.bin +3 -0
checkpoint-500-epoch-0-val-wer-96.036/preprocessor_config.json +14 -0
checkpoint-500-epoch-0-val-wer-96.036/random_states_0.pkl +3 -0
checkpoint-500-epoch-0-val-wer-96.036/scheduler.bin +3 -0
checkpoint-500-epoch-0-val-wer-96.036/special_tokens_map.json +139 -0
checkpoint-500-epoch-0-val-wer-96.036/tokenizer.json +0 -0
checkpoint-500-epoch-0-val-wer-96.036/tokenizer_config.json +0 -0
checkpoint-500-epoch-0-val-wer-96.036/vocab.json +0 -0
config.json +285 -0
create_student_model.py +231 -0
distil-whisper/events.out.tfevents.1730988960.a100-80-west4a.48904.0 +3 -0
distil-whisper/events.out.tfevents.1730989066.a100-80-west4a.49408.0 +3 -0
distil-whisper/events.out.tfevents.1730989452.a100-80-west4a.68077.0 +3 -0
distil-whisper/events.out.tfevents.1730990001.a100-80-west4a.87125.0 +3 -0
distil_whisper/__init__.py +21 -0
distil_whisper/layers.py +1338 -0
distil_whisper/modeling_flax_whisper.py +2135 -0
distil_whisper/partitioner.py +965 -0
distil_whisper/pipeline.py +527 -0
distil_whisper/train_state.py +118 -0
generation_config.json +256 -0
merges.txt +0 -0
model.safetensors +3 -0
nb-distil-large-init/added_tokens.json +1611 -0
nb-distil-large-init/config.json +285 -0
nb-distil-large-init/generation_config.json +256 -0
nb-distil-large-init/merges.txt +0 -0
nb-distil-large-init/model.safetensors +3 -0
nb-distil-large-init/preprocessor_config.json +14 -0
nb-distil-large-init/special_tokens_map.json +139 -0
nb-distil-large-init/tokenizer_config.json +0 -0
nb-distil-large-init/vocab.json +0 -0
preprocessor_config.json +14 -0
run_distillation.py +1827 -0
run_large_training.sh +41 -0
special_tokens_map.json +139 -0
tokenizer.json +0 -0
tokenizer_config.json +0 -0
vocab.json +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ wandb

added_tokens.json ADDED Viewed

	@@ -0,0 +1,1611 @@

+{
+  "<|0.00|>": 50365,
+  "<|0.02|>": 50366,
+  "<|0.04|>": 50367,
+  "<|0.06|>": 50368,
+  "<|0.08|>": 50369,
+  "<|0.10|>": 50370,
+  "<|0.12|>": 50371,
+  "<|0.14|>": 50372,
+  "<|0.16|>": 50373,
+  "<|0.18|>": 50374,
+  "<|0.20|>": 50375,
+  "<|0.22|>": 50376,
+  "<|0.24|>": 50377,
+  "<|0.26|>": 50378,
+  "<|0.28|>": 50379,
+  "<|0.30|>": 50380,
+  "<|0.32|>": 50381,
+  "<|0.34|>": 50382,
+  "<|0.36|>": 50383,
+  "<|0.38|>": 50384,
+  "<|0.40|>": 50385,
+  "<|0.42|>": 50386,
+  "<|0.44|>": 50387,
+  "<|0.46|>": 50388,
+  "<|0.48|>": 50389,
+  "<|0.50|>": 50390,
+  "<|0.52|>": 50391,
+  "<|0.54|>": 50392,
+  "<|0.56|>": 50393,
+  "<|0.58|>": 50394,
+  "<|0.60|>": 50395,
+  "<|0.62|>": 50396,
+  "<|0.64|>": 50397,
+  "<|0.66|>": 50398,
+  "<|0.68|>": 50399,
+  "<|0.70|>": 50400,
+  "<|0.72|>": 50401,
+  "<|0.74|>": 50402,
+  "<|0.76|>": 50403,
+  "<|0.78|>": 50404,
+  "<|0.80|>": 50405,
+  "<|0.82|>": 50406,
+  "<|0.84|>": 50407,
+  "<|0.86|>": 50408,
+  "<|0.88|>": 50409,
+  "<|0.90|>": 50410,
+  "<|0.92|>": 50411,
+  "<|0.94|>": 50412,
+  "<|0.96|>": 50413,
+  "<|0.98|>": 50414,
+  "<|1.00|>": 50415,
+  "<|1.02|>": 50416,
+  "<|1.04|>": 50417,
+  "<|1.06|>": 50418,
+  "<|1.08|>": 50419,
+  "<|1.10|>": 50420,
+  "<|1.12|>": 50421,
+  "<|1.14|>": 50422,
+  "<|1.16|>": 50423,
+  "<|1.18|>": 50424,
+  "<|1.20|>": 50425,
+  "<|1.22|>": 50426,
+  "<|1.24|>": 50427,
+  "<|1.26|>": 50428,
+  "<|1.28|>": 50429,
+  "<|1.30|>": 50430,
+  "<|1.32|>": 50431,
+  "<|1.34|>": 50432,
+  "<|1.36|>": 50433,
+  "<|1.38|>": 50434,
+  "<|1.40|>": 50435,
+  "<|1.42|>": 50436,
+  "<|1.44|>": 50437,
+  "<|1.46|>": 50438,
+  "<|1.48|>": 50439,
+  "<|1.50|>": 50440,
+  "<|1.52|>": 50441,
+  "<|1.54|>": 50442,
+  "<|1.56|>": 50443,
+  "<|1.58|>": 50444,
+  "<|1.60|>": 50445,
+  "<|1.62|>": 50446,
+  "<|1.64|>": 50447,
+  "<|1.66|>": 50448,
+  "<|1.68|>": 50449,
+  "<|1.70|>": 50450,
+  "<|1.72|>": 50451,
+  "<|1.74|>": 50452,
+  "<|1.76|>": 50453,
+  "<|1.78|>": 50454,
+  "<|1.80|>": 50455,
+  "<|1.82|>": 50456,
+  "<|1.84|>": 50457,
+  "<|1.86|>": 50458,
+  "<|1.88|>": 50459,
+  "<|1.90|>": 50460,
+  "<|1.92|>": 50461,
+  "<|1.94|>": 50462,
+  "<|1.96|>": 50463,
+  "<|1.98|>": 50464,
+  "<|10.00|>": 50865,
+  "<|10.02|>": 50866,
+  "<|10.04|>": 50867,
+  "<|10.06|>": 50868,
+  "<|10.08|>": 50869,
+  "<|10.10|>": 50870,
+  "<|10.12|>": 50871,
+  "<|10.14|>": 50872,
+  "<|10.16|>": 50873,
+  "<|10.18|>": 50874,
+  "<|10.20|>": 50875,
+  "<|10.22|>": 50876,
+  "<|10.24|>": 50877,
+  "<|10.26|>": 50878,
+  "<|10.28|>": 50879,
+  "<|10.30|>": 50880,
+  "<|10.32|>": 50881,
+  "<|10.34|>": 50882,
+  "<|10.36|>": 50883,
+  "<|10.38|>": 50884,
+  "<|10.40|>": 50885,
+  "<|10.42|>": 50886,
+  "<|10.44|>": 50887,
+  "<|10.46|>": 50888,
+  "<|10.48|>": 50889,
+  "<|10.50|>": 50890,
+  "<|10.52|>": 50891,
+  "<|10.54|>": 50892,
+  "<|10.56|>": 50893,
+  "<|10.58|>": 50894,
+  "<|10.60|>": 50895,
+  "<|10.62|>": 50896,
+  "<|10.64|>": 50897,
+  "<|10.66|>": 50898,
+  "<|10.68|>": 50899,
+  "<|10.70|>": 50900,
+  "<|10.72|>": 50901,
+  "<|10.74|>": 50902,
+  "<|10.76|>": 50903,
+  "<|10.78|>": 50904,
+  "<|10.80|>": 50905,
+  "<|10.82|>": 50906,
+  "<|10.84|>": 50907,
+  "<|10.86|>": 50908,
+  "<|10.88|>": 50909,
+  "<|10.90|>": 50910,
+  "<|10.92|>": 50911,
+  "<|10.94|>": 50912,
+  "<|10.96|>": 50913,
+  "<|10.98|>": 50914,
+  "<|11.00|>": 50915,
+  "<|11.02|>": 50916,
+  "<|11.04|>": 50917,
+  "<|11.06|>": 50918,
+  "<|11.08|>": 50919,
+  "<|11.10|>": 50920,
+  "<|11.12|>": 50921,
+  "<|11.14|>": 50922,
+  "<|11.16|>": 50923,
+  "<|11.18|>": 50924,
+  "<|11.20|>": 50925,
+  "<|11.22|>": 50926,
+  "<|11.24|>": 50927,
+  "<|11.26|>": 50928,
+  "<|11.28|>": 50929,
+  "<|11.30|>": 50930,
+  "<|11.32|>": 50931,
+  "<|11.34|>": 50932,
+  "<|11.36|>": 50933,
+  "<|11.38|>": 50934,
+  "<|11.40|>": 50935,
+  "<|11.42|>": 50936,
+  "<|11.44|>": 50937,
+  "<|11.46|>": 50938,
+  "<|11.48|>": 50939,
+  "<|11.50|>": 50940,
+  "<|11.52|>": 50941,
+  "<|11.54|>": 50942,
+  "<|11.56|>": 50943,
+  "<|11.58|>": 50944,
+  "<|11.60|>": 50945,
+  "<|11.62|>": 50946,
+  "<|11.64|>": 50947,
+  "<|11.66|>": 50948,
+  "<|11.68|>": 50949,
+  "<|11.70|>": 50950,
+  "<|11.72|>": 50951,
+  "<|11.74|>": 50952,
+  "<|11.76|>": 50953,
+  "<|11.78|>": 50954,
+  "<|11.80|>": 50955,
+  "<|11.82|>": 50956,
+  "<|11.84|>": 50957,
+  "<|11.86|>": 50958,
+  "<|11.88|>": 50959,
+  "<|11.90|>": 50960,
+  "<|11.92|>": 50961,
+  "<|11.94|>": 50962,
+  "<|11.96|>": 50963,
+  "<|11.98|>": 50964,
+  "<|12.00|>": 50965,
+  "<|12.02|>": 50966,
+  "<|12.04|>": 50967,
+  "<|12.06|>": 50968,
+  "<|12.08|>": 50969,
+  "<|12.10|>": 50970,
+  "<|12.12|>": 50971,
+  "<|12.14|>": 50972,
+  "<|12.16|>": 50973,
+  "<|12.18|>": 50974,
+  "<|12.20|>": 50975,
+  "<|12.22|>": 50976,
+  "<|12.24|>": 50977,
+  "<|12.26|>": 50978,
+  "<|12.28|>": 50979,
+  "<|12.30|>": 50980,
+  "<|12.32|>": 50981,
+  "<|12.34|>": 50982,
+  "<|12.36|>": 50983,
+  "<|12.38|>": 50984,
+  "<|12.40|>": 50985,
+  "<|12.42|>": 50986,
+  "<|12.44|>": 50987,
+  "<|12.46|>": 50988,
+  "<|12.48|>": 50989,
+  "<|12.50|>": 50990,
+  "<|12.52|>": 50991,
+  "<|12.54|>": 50992,
+  "<|12.56|>": 50993,
+  "<|12.58|>": 50994,
+  "<|12.60|>": 50995,
+  "<|12.62|>": 50996,
+  "<|12.64|>": 50997,
+  "<|12.66|>": 50998,
+  "<|12.68|>": 50999,
+  "<|12.70|>": 51000,
+  "<|12.72|>": 51001,
+  "<|12.74|>": 51002,
+  "<|12.76|>": 51003,
+  "<|12.78|>": 51004,
+  "<|12.80|>": 51005,
+  "<|12.82|>": 51006,
+  "<|12.84|>": 51007,
+  "<|12.86|>": 51008,
+  "<|12.88|>": 51009,
+  "<|12.90|>": 51010,
+  "<|12.92|>": 51011,
+  "<|12.94|>": 51012,
+  "<|12.96|>": 51013,
+  "<|12.98|>": 51014,
+  "<|13.00|>": 51015,
+  "<|13.02|>": 51016,
+  "<|13.04|>": 51017,
+  "<|13.06|>": 51018,
+  "<|13.08|>": 51019,
+  "<|13.10|>": 51020,
+  "<|13.12|>": 51021,
+  "<|13.14|>": 51022,
+  "<|13.16|>": 51023,
+  "<|13.18|>": 51024,
+  "<|13.20|>": 51025,
+  "<|13.22|>": 51026,
+  "<|13.24|>": 51027,
+  "<|13.26|>": 51028,
+  "<|13.28|>": 51029,
+  "<|13.30|>": 51030,
+  "<|13.32|>": 51031,
+  "<|13.34|>": 51032,
+  "<|13.36|>": 51033,
+  "<|13.38|>": 51034,
+  "<|13.40|>": 51035,
+  "<|13.42|>": 51036,
+  "<|13.44|>": 51037,
+  "<|13.46|>": 51038,
+  "<|13.48|>": 51039,
+  "<|13.50|>": 51040,
+  "<|13.52|>": 51041,
+  "<|13.54|>": 51042,
+  "<|13.56|>": 51043,
+  "<|13.58|>": 51044,
+  "<|13.60|>": 51045,
+  "<|13.62|>": 51046,
+  "<|13.64|>": 51047,
+  "<|13.66|>": 51048,
+  "<|13.68|>": 51049,
+  "<|13.70|>": 51050,
+  "<|13.72|>": 51051,
+  "<|13.74|>": 51052,
+  "<|13.76|>": 51053,
+  "<|13.78|>": 51054,
+  "<|13.80|>": 51055,
+  "<|13.82|>": 51056,
+  "<|13.84|>": 51057,
+  "<|13.86|>": 51058,
+  "<|13.88|>": 51059,
+  "<|13.90|>": 51060,
+  "<|13.92|>": 51061,
+  "<|13.94|>": 51062,
+  "<|13.96|>": 51063,
+  "<|13.98|>": 51064,
+  "<|14.00|>": 51065,
+  "<|14.02|>": 51066,
+  "<|14.04|>": 51067,
+  "<|14.06|>": 51068,
+  "<|14.08|>": 51069,
+  "<|14.10|>": 51070,
+  "<|14.12|>": 51071,
+  "<|14.14|>": 51072,
+  "<|14.16|>": 51073,
+  "<|14.18|>": 51074,
+  "<|14.20|>": 51075,
+  "<|14.22|>": 51076,
+  "<|14.24|>": 51077,
+  "<|14.26|>": 51078,
+  "<|14.28|>": 51079,
+  "<|14.30|>": 51080,
+  "<|14.32|>": 51081,
+  "<|14.34|>": 51082,
+  "<|14.36|>": 51083,
+  "<|14.38|>": 51084,
+  "<|14.40|>": 51085,
+  "<|14.42|>": 51086,
+  "<|14.44|>": 51087,
+  "<|14.46|>": 51088,
+  "<|14.48|>": 51089,
+  "<|14.50|>": 51090,
+  "<|14.52|>": 51091,
+  "<|14.54|>": 51092,
+  "<|14.56|>": 51093,
+  "<|14.58|>": 51094,
+  "<|14.60|>": 51095,
+  "<|14.62|>": 51096,
+  "<|14.64|>": 51097,
+  "<|14.66|>": 51098,
+  "<|14.68|>": 51099,
+  "<|14.70|>": 51100,
+  "<|14.72|>": 51101,
+  "<|14.74|>": 51102,
+  "<|14.76|>": 51103,
+  "<|14.78|>": 51104,
+  "<|14.80|>": 51105,
+  "<|14.82|>": 51106,
+  "<|14.84|>": 51107,
+  "<|14.86|>": 51108,
+  "<|14.88|>": 51109,
+  "<|14.90|>": 51110,
+  "<|14.92|>": 51111,
+  "<|14.94|>": 51112,
+  "<|14.96|>": 51113,
+  "<|14.98|>": 51114,
+  "<|15.00|>": 51115,
+  "<|15.02|>": 51116,
+  "<|15.04|>": 51117,
+  "<|15.06|>": 51118,
+  "<|15.08|>": 51119,
+  "<|15.10|>": 51120,
+  "<|15.12|>": 51121,
+  "<|15.14|>": 51122,
+  "<|15.16|>": 51123,
+  "<|15.18|>": 51124,
+  "<|15.20|>": 51125,
+  "<|15.22|>": 51126,
+  "<|15.24|>": 51127,
+  "<|15.26|>": 51128,
+  "<|15.28|>": 51129,
+  "<|15.30|>": 51130,
+  "<|15.32|>": 51131,
+  "<|15.34|>": 51132,
+  "<|15.36|>": 51133,
+  "<|15.38|>": 51134,
+  "<|15.40|>": 51135,
+  "<|15.42|>": 51136,
+  "<|15.44|>": 51137,
+  "<|15.46|>": 51138,
+  "<|15.48|>": 51139,
+  "<|15.50|>": 51140,
+  "<|15.52|>": 51141,
+  "<|15.54|>": 51142,
+  "<|15.56|>": 51143,
+  "<|15.58|>": 51144,
+  "<|15.60|>": 51145,
+  "<|15.62|>": 51146,
+  "<|15.64|>": 51147,
+  "<|15.66|>": 51148,
+  "<|15.68|>": 51149,
+  "<|15.70|>": 51150,
+  "<|15.72|>": 51151,
+  "<|15.74|>": 51152,
+  "<|15.76|>": 51153,
+  "<|15.78|>": 51154,
+  "<|15.80|>": 51155,
+  "<|15.82|>": 51156,
+  "<|15.84|>": 51157,
+  "<|15.86|>": 51158,
+  "<|15.88|>": 51159,
+  "<|15.90|>": 51160,
+  "<|15.92|>": 51161,
+  "<|15.94|>": 51162,
+  "<|15.96|>": 51163,
+  "<|15.98|>": 51164,
+  "<|16.00|>": 51165,
+  "<|16.02|>": 51166,
+  "<|16.04|>": 51167,
+  "<|16.06|>": 51168,
+  "<|16.08|>": 51169,
+  "<|16.10|>": 51170,
+  "<|16.12|>": 51171,
+  "<|16.14|>": 51172,
+  "<|16.16|>": 51173,
+  "<|16.18|>": 51174,
+  "<|16.20|>": 51175,
+  "<|16.22|>": 51176,
+  "<|16.24|>": 51177,
+  "<|16.26|>": 51178,
+  "<|16.28|>": 51179,
+  "<|16.30|>": 51180,
+  "<|16.32|>": 51181,
+  "<|16.34|>": 51182,
+  "<|16.36|>": 51183,
+  "<|16.38|>": 51184,
+  "<|16.40|>": 51185,
+  "<|16.42|>": 51186,
+  "<|16.44|>": 51187,
+  "<|16.46|>": 51188,
+  "<|16.48|>": 51189,
+  "<|16.50|>": 51190,
+  "<|16.52|>": 51191,
+  "<|16.54|>": 51192,
+  "<|16.56|>": 51193,
+  "<|16.58|>": 51194,
+  "<|16.60|>": 51195,
+  "<|16.62|>": 51196,
+  "<|16.64|>": 51197,
+  "<|16.66|>": 51198,
+  "<|16.68|>": 51199,
+  "<|16.70|>": 51200,
+  "<|16.72|>": 51201,
+  "<|16.74|>": 51202,
+  "<|16.76|>": 51203,
+  "<|16.78|>": 51204,
+  "<|16.80|>": 51205,
+  "<|16.82|>": 51206,
+  "<|16.84|>": 51207,
+  "<|16.86|>": 51208,
+  "<|16.88|>": 51209,
+  "<|16.90|>": 51210,
+  "<|16.92|>": 51211,
+  "<|16.94|>": 51212,
+  "<|16.96|>": 51213,
+  "<|16.98|>": 51214,
+  "<|17.00|>": 51215,
+  "<|17.02|>": 51216,
+  "<|17.04|>": 51217,
+  "<|17.06|>": 51218,
+  "<|17.08|>": 51219,
+  "<|17.10|>": 51220,
+  "<|17.12|>": 51221,
+  "<|17.14|>": 51222,
+  "<|17.16|>": 51223,
+  "<|17.18|>": 51224,
+  "<|17.20|>": 51225,
+  "<|17.22|>": 51226,
+  "<|17.24|>": 51227,
+  "<|17.26|>": 51228,
+  "<|17.28|>": 51229,
+  "<|17.30|>": 51230,
+  "<|17.32|>": 51231,
+  "<|17.34|>": 51232,
+  "<|17.36|>": 51233,
+  "<|17.38|>": 51234,
+  "<|17.40|>": 51235,
+  "<|17.42|>": 51236,
+  "<|17.44|>": 51237,
+  "<|17.46|>": 51238,
+  "<|17.48|>": 51239,
+  "<|17.50|>": 51240,
+  "<|17.52|>": 51241,
+  "<|17.54|>": 51242,
+  "<|17.56|>": 51243,
+  "<|17.58|>": 51244,
+  "<|17.60|>": 51245,
+  "<|17.62|>": 51246,
+  "<|17.64|>": 51247,
+  "<|17.66|>": 51248,
+  "<|17.68|>": 51249,
+  "<|17.70|>": 51250,
+  "<|17.72|>": 51251,
+  "<|17.74|>": 51252,
+  "<|17.76|>": 51253,
+  "<|17.78|>": 51254,
+  "<|17.80|>": 51255,
+  "<|17.82|>": 51256,
+  "<|17.84|>": 51257,
+  "<|17.86|>": 51258,
+  "<|17.88|>": 51259,
+  "<|17.90|>": 51260,
+  "<|17.92|>": 51261,
+  "<|17.94|>": 51262,
+  "<|17.96|>": 51263,
+  "<|17.98|>": 51264,
+  "<|18.00|>": 51265,
+  "<|18.02|>": 51266,
+  "<|18.04|>": 51267,
+  "<|18.06|>": 51268,
+  "<|18.08|>": 51269,
+  "<|18.10|>": 51270,
+  "<|18.12|>": 51271,
+  "<|18.14|>": 51272,
+  "<|18.16|>": 51273,
+  "<|18.18|>": 51274,
+  "<|18.20|>": 51275,
+  "<|18.22|>": 51276,
+  "<|18.24|>": 51277,
+  "<|18.26|>": 51278,
+  "<|18.28|>": 51279,
+  "<|18.30|>": 51280,
+  "<|18.32|>": 51281,
+  "<|18.34|>": 51282,
+  "<|18.36|>": 51283,
+  "<|18.38|>": 51284,
+  "<|18.40|>": 51285,
+  "<|18.42|>": 51286,
+  "<|18.44|>": 51287,
+  "<|18.46|>": 51288,
+  "<|18.48|>": 51289,
+  "<|18.50|>": 51290,
+  "<|18.52|>": 51291,
+  "<|18.54|>": 51292,
+  "<|18.56|>": 51293,
+  "<|18.58|>": 51294,
+  "<|18.60|>": 51295,
+  "<|18.62|>": 51296,
+  "<|18.64|>": 51297,
+  "<|18.66|>": 51298,
+  "<|18.68|>": 51299,
+  "<|18.70|>": 51300,
+  "<|18.72|>": 51301,
+  "<|18.74|>": 51302,
+  "<|18.76|>": 51303,
+  "<|18.78|>": 51304,
+  "<|18.80|>": 51305,
+  "<|18.82|>": 51306,
+  "<|18.84|>": 51307,
+  "<|18.86|>": 51308,
+  "<|18.88|>": 51309,
+  "<|18.90|>": 51310,
+  "<|18.92|>": 51311,
+  "<|18.94|>": 51312,
+  "<|18.96|>": 51313,
+  "<|18.98|>": 51314,
+  "<|19.00|>": 51315,
+  "<|19.02|>": 51316,
+  "<|19.04|>": 51317,
+  "<|19.06|>": 51318,
+  "<|19.08|>": 51319,
+  "<|19.10|>": 51320,
+  "<|19.12|>": 51321,
+  "<|19.14|>": 51322,
+  "<|19.16|>": 51323,
+  "<|19.18|>": 51324,
+  "<|19.20|>": 51325,
+  "<|19.22|>": 51326,
+  "<|19.24|>": 51327,
+  "<|19.26|>": 51328,
+  "<|19.28|>": 51329,
+  "<|19.30|>": 51330,
+  "<|19.32|>": 51331,
+  "<|19.34|>": 51332,
+  "<|19.36|>": 51333,
+  "<|19.38|>": 51334,
+  "<|19.40|>": 51335,
+  "<|19.42|>": 51336,
+  "<|19.44|>": 51337,
+  "<|19.46|>": 51338,
+  "<|19.48|>": 51339,
+  "<|19.50|>": 51340,
+  "<|19.52|>": 51341,
+  "<|19.54|>": 51342,
+  "<|19.56|>": 51343,
+  "<|19.58|>": 51344,
+  "<|19.60|>": 51345,
+  "<|19.62|>": 51346,
+  "<|19.64|>": 51347,
+  "<|19.66|>": 51348,
+  "<|19.68|>": 51349,
+  "<|19.70|>": 51350,
+  "<|19.72|>": 51351,
+  "<|19.74|>": 51352,
+  "<|19.76|>": 51353,
+  "<|19.78|>": 51354,
+  "<|19.80|>": 51355,
+  "<|19.82|>": 51356,
+  "<|19.84|>": 51357,
+  "<|19.86|>": 51358,
+  "<|19.88|>": 51359,
+  "<|19.90|>": 51360,
+  "<|19.92|>": 51361,
+  "<|19.94|>": 51362,
+  "<|19.96|>": 51363,
+  "<|19.98|>": 51364,
+  "<|2.00|>": 50465,
+  "<|2.02|>": 50466,
+  "<|2.04|>": 50467,
+  "<|2.06|>": 50468,
+  "<|2.08|>": 50469,
+  "<|2.10|>": 50470,
+  "<|2.12|>": 50471,
+  "<|2.14|>": 50472,
+  "<|2.16|>": 50473,
+  "<|2.18|>": 50474,
+  "<|2.20|>": 50475,
+  "<|2.22|>": 50476,
+  "<|2.24|>": 50477,
+  "<|2.26|>": 50478,
+  "<|2.28|>": 50479,
+  "<|2.30|>": 50480,
+  "<|2.32|>": 50481,
+  "<|2.34|>": 50482,
+  "<|2.36|>": 50483,
+  "<|2.38|>": 50484,
+  "<|2.40|>": 50485,
+  "<|2.42|>": 50486,
+  "<|2.44|>": 50487,
+  "<|2.46|>": 50488,
+  "<|2.48|>": 50489,
+  "<|2.50|>": 50490,
+  "<|2.52|>": 50491,
+  "<|2.54|>": 50492,
+  "<|2.56|>": 50493,
+  "<|2.58|>": 50494,
+  "<|2.60|>": 50495,
+  "<|2.62|>": 50496,
+  "<|2.64|>": 50497,
+  "<|2.66|>": 50498,
+  "<|2.68|>": 50499,
+  "<|2.70|>": 50500,
+  "<|2.72|>": 50501,
+  "<|2.74|>": 50502,
+  "<|2.76|>": 50503,
+  "<|2.78|>": 50504,
+  "<|2.80|>": 50505,
+  "<|2.82|>": 50506,
+  "<|2.84|>": 50507,
+  "<|2.86|>": 50508,
+  "<|2.88|>": 50509,
+  "<|2.90|>": 50510,
+  "<|2.92|>": 50511,
+  "<|2.94|>": 50512,
+  "<|2.96|>": 50513,
+  "<|2.98|>": 50514,
+  "<|20.00|>": 51365,
+  "<|20.02|>": 51366,
+  "<|20.04|>": 51367,
+  "<|20.06|>": 51368,
+  "<|20.08|>": 51369,
+  "<|20.10|>": 51370,
+  "<|20.12|>": 51371,
+  "<|20.14|>": 51372,
+  "<|20.16|>": 51373,
+  "<|20.18|>": 51374,
+  "<|20.20|>": 51375,
+  "<|20.22|>": 51376,
+  "<|20.24|>": 51377,
+  "<|20.26|>": 51378,
+  "<|20.28|>": 51379,
+  "<|20.30|>": 51380,
+  "<|20.32|>": 51381,
+  "<|20.34|>": 51382,
+  "<|20.36|>": 51383,
+  "<|20.38|>": 51384,
+  "<|20.40|>": 51385,
+  "<|20.42|>": 51386,
+  "<|20.44|>": 51387,
+  "<|20.46|>": 51388,
+  "<|20.48|>": 51389,
+  "<|20.50|>": 51390,
+  "<|20.52|>": 51391,
+  "<|20.54|>": 51392,
+  "<|20.56|>": 51393,
+  "<|20.58|>": 51394,
+  "<|20.60|>": 51395,
+  "<|20.62|>": 51396,
+  "<|20.64|>": 51397,
+  "<|20.66|>": 51398,
+  "<|20.68|>": 51399,
+  "<|20.70|>": 51400,
+  "<|20.72|>": 51401,
+  "<|20.74|>": 51402,
+  "<|20.76|>": 51403,
+  "<|20.78|>": 51404,
+  "<|20.80|>": 51405,
+  "<|20.82|>": 51406,
+  "<|20.84|>": 51407,
+  "<|20.86|>": 51408,
+  "<|20.88|>": 51409,
+  "<|20.90|>": 51410,
+  "<|20.92|>": 51411,
+  "<|20.94|>": 51412,
+  "<|20.96|>": 51413,
+  "<|20.98|>": 51414,
+  "<|21.00|>": 51415,
+  "<|21.02|>": 51416,
+  "<|21.04|>": 51417,
+  "<|21.06|>": 51418,
+  "<|21.08|>": 51419,
+  "<|21.10|>": 51420,
+  "<|21.12|>": 51421,
+  "<|21.14|>": 51422,
+  "<|21.16|>": 51423,
+  "<|21.18|>": 51424,
+  "<|21.20|>": 51425,
+  "<|21.22|>": 51426,
+  "<|21.24|>": 51427,
+  "<|21.26|>": 51428,
+  "<|21.28|>": 51429,
+  "<|21.30|>": 51430,
+  "<|21.32|>": 51431,
+  "<|21.34|>": 51432,
+  "<|21.36|>": 51433,
+  "<|21.38|>": 51434,
+  "<|21.40|>": 51435,
+  "<|21.42|>": 51436,
+  "<|21.44|>": 51437,
+  "<|21.46|>": 51438,
+  "<|21.48|>": 51439,
+  "<|21.50|>": 51440,
+  "<|21.52|>": 51441,
+  "<|21.54|>": 51442,
+  "<|21.56|>": 51443,
+  "<|21.58|>": 51444,
+  "<|21.60|>": 51445,
+  "<|21.62|>": 51446,
+  "<|21.64|>": 51447,
+  "<|21.66|>": 51448,
+  "<|21.68|>": 51449,
+  "<|21.70|>": 51450,
+  "<|21.72|>": 51451,
+  "<|21.74|>": 51452,
+  "<|21.76|>": 51453,
+  "<|21.78|>": 51454,
+  "<|21.80|>": 51455,
+  "<|21.82|>": 51456,
+  "<|21.84|>": 51457,
+  "<|21.86|>": 51458,
+  "<|21.88|>": 51459,
+  "<|21.90|>": 51460,
+  "<|21.92|>": 51461,
+  "<|21.94|>": 51462,
+  "<|21.96|>": 51463,
+  "<|21.98|>": 51464,
+  "<|22.00|>": 51465,
+  "<|22.02|>": 51466,
+  "<|22.04|>": 51467,
+  "<|22.06|>": 51468,
+  "<|22.08|>": 51469,
+  "<|22.10|>": 51470,
+  "<|22.12|>": 51471,
+  "<|22.14|>": 51472,
+  "<|22.16|>": 51473,
+  "<|22.18|>": 51474,
+  "<|22.20|>": 51475,
+  "<|22.22|>": 51476,
+  "<|22.24|>": 51477,
+  "<|22.26|>": 51478,
+  "<|22.28|>": 51479,
+  "<|22.30|>": 51480,
+  "<|22.32|>": 51481,
+  "<|22.34|>": 51482,
+  "<|22.36|>": 51483,
+  "<|22.38|>": 51484,
+  "<|22.40|>": 51485,
+  "<|22.42|>": 51486,
+  "<|22.44|>": 51487,
+  "<|22.46|>": 51488,
+  "<|22.48|>": 51489,
+  "<|22.50|>": 51490,
+  "<|22.52|>": 51491,
+  "<|22.54|>": 51492,
+  "<|22.56|>": 51493,
+  "<|22.58|>": 51494,
+  "<|22.60|>": 51495,
+  "<|22.62|>": 51496,
+  "<|22.64|>": 51497,
+  "<|22.66|>": 51498,
+  "<|22.68|>": 51499,
+  "<|22.70|>": 51500,
+  "<|22.72|>": 51501,
+  "<|22.74|>": 51502,
+  "<|22.76|>": 51503,
+  "<|22.78|>": 51504,
+  "<|22.80|>": 51505,
+  "<|22.82|>": 51506,
+  "<|22.84|>": 51507,
+  "<|22.86|>": 51508,
+  "<|22.88|>": 51509,
+  "<|22.90|>": 51510,
+  "<|22.92|>": 51511,
+  "<|22.94|>": 51512,
+  "<|22.96|>": 51513,
+  "<|22.98|>": 51514,
+  "<|23.00|>": 51515,
+  "<|23.02|>": 51516,
+  "<|23.04|>": 51517,
+  "<|23.06|>": 51518,
+  "<|23.08|>": 51519,
+  "<|23.10|>": 51520,
+  "<|23.12|>": 51521,
+  "<|23.14|>": 51522,
+  "<|23.16|>": 51523,
+  "<|23.18|>": 51524,
+  "<|23.20|>": 51525,
+  "<|23.22|>": 51526,
+  "<|23.24|>": 51527,
+  "<|23.26|>": 51528,
+  "<|23.28|>": 51529,
+  "<|23.30|>": 51530,
+  "<|23.32|>": 51531,
+  "<|23.34|>": 51532,
+  "<|23.36|>": 51533,
+  "<|23.38|>": 51534,
+  "<|23.40|>": 51535,
+  "<|23.42|>": 51536,
+  "<|23.44|>": 51537,
+  "<|23.46|>": 51538,
+  "<|23.48|>": 51539,
+  "<|23.50|>": 51540,
+  "<|23.52|>": 51541,
+  "<|23.54|>": 51542,
+  "<|23.56|>": 51543,
+  "<|23.58|>": 51544,
+  "<|23.60|>": 51545,
+  "<|23.62|>": 51546,
+  "<|23.64|>": 51547,
+  "<|23.66|>": 51548,
+  "<|23.68|>": 51549,
+  "<|23.70|>": 51550,
+  "<|23.72|>": 51551,
+  "<|23.74|>": 51552,
+  "<|23.76|>": 51553,
+  "<|23.78|>": 51554,
+  "<|23.80|>": 51555,
+  "<|23.82|>": 51556,
+  "<|23.84|>": 51557,
+  "<|23.86|>": 51558,
+  "<|23.88|>": 51559,
+  "<|23.90|>": 51560,
+  "<|23.92|>": 51561,
+  "<|23.94|>": 51562,
+  "<|23.96|>": 51563,
+  "<|23.98|>": 51564,
+  "<|24.00|>": 51565,
+  "<|24.02|>": 51566,
+  "<|24.04|>": 51567,
+  "<|24.06|>": 51568,
+  "<|24.08|>": 51569,
+  "<|24.10|>": 51570,
+  "<|24.12|>": 51571,
+  "<|24.14|>": 51572,
+  "<|24.16|>": 51573,
+  "<|24.18|>": 51574,
+  "<|24.20|>": 51575,
+  "<|24.22|>": 51576,
+  "<|24.24|>": 51577,
+  "<|24.26|>": 51578,
+  "<|24.28|>": 51579,
+  "<|24.30|>": 51580,
+  "<|24.32|>": 51581,
+  "<|24.34|>": 51582,
+  "<|24.36|>": 51583,
+  "<|24.38|>": 51584,
+  "<|24.40|>": 51585,
+  "<|24.42|>": 51586,
+  "<|24.44|>": 51587,
+  "<|24.46|>": 51588,
+  "<|24.48|>": 51589,
+  "<|24.50|>": 51590,
+  "<|24.52|>": 51591,
+  "<|24.54|>": 51592,
+  "<|24.56|>": 51593,
+  "<|24.58|>": 51594,
+  "<|24.60|>": 51595,
+  "<|24.62|>": 51596,
+  "<|24.64|>": 51597,
+  "<|24.66|>": 51598,
+  "<|24.68|>": 51599,
+  "<|24.70|>": 51600,
+  "<|24.72|>": 51601,
+  "<|24.74|>": 51602,
+  "<|24.76|>": 51603,
+  "<|24.78|>": 51604,
+  "<|24.80|>": 51605,
+  "<|24.82|>": 51606,
+  "<|24.84|>": 51607,
+  "<|24.86|>": 51608,
+  "<|24.88|>": 51609,
+  "<|24.90|>": 51610,
+  "<|24.92|>": 51611,
+  "<|24.94|>": 51612,
+  "<|24.96|>": 51613,
+  "<|24.98|>": 51614,
+  "<|25.00|>": 51615,
+  "<|25.02|>": 51616,
+  "<|25.04|>": 51617,
+  "<|25.06|>": 51618,
+  "<|25.08|>": 51619,
+  "<|25.10|>": 51620,
+  "<|25.12|>": 51621,
+  "<|25.14|>": 51622,
+  "<|25.16|>": 51623,
+  "<|25.18|>": 51624,
+  "<|25.20|>": 51625,
+  "<|25.22|>": 51626,
+  "<|25.24|>": 51627,
+  "<|25.26|>": 51628,
+  "<|25.28|>": 51629,
+  "<|25.30|>": 51630,
+  "<|25.32|>": 51631,
+  "<|25.34|>": 51632,
+  "<|25.36|>": 51633,
+  "<|25.38|>": 51634,
+  "<|25.40|>": 51635,
+  "<|25.42|>": 51636,
+  "<|25.44|>": 51637,
+  "<|25.46|>": 51638,
+  "<|25.48|>": 51639,
+  "<|25.50|>": 51640,
+  "<|25.52|>": 51641,
+  "<|25.54|>": 51642,
+  "<|25.56|>": 51643,
+  "<|25.58|>": 51644,
+  "<|25.60|>": 51645,
+  "<|25.62|>": 51646,
+  "<|25.64|>": 51647,
+  "<|25.66|>": 51648,
+  "<|25.68|>": 51649,
+  "<|25.70|>": 51650,
+  "<|25.72|>": 51651,
+  "<|25.74|>": 51652,
+  "<|25.76|>": 51653,
+  "<|25.78|>": 51654,
+  "<|25.80|>": 51655,
+  "<|25.82|>": 51656,
+  "<|25.84|>": 51657,
+  "<|25.86|>": 51658,
+  "<|25.88|>": 51659,
+  "<|25.90|>": 51660,
+  "<|25.92|>": 51661,
+  "<|25.94|>": 51662,
+  "<|25.96|>": 51663,
+  "<|25.98|>": 51664,
+  "<|26.00|>": 51665,
+  "<|26.02|>": 51666,
+  "<|26.04|>": 51667,
+  "<|26.06|>": 51668,
+  "<|26.08|>": 51669,
+  "<|26.10|>": 51670,
+  "<|26.12|>": 51671,
+  "<|26.14|>": 51672,
+  "<|26.16|>": 51673,
+  "<|26.18|>": 51674,
+  "<|26.20|>": 51675,
+  "<|26.22|>": 51676,
+  "<|26.24|>": 51677,
+  "<|26.26|>": 51678,
+  "<|26.28|>": 51679,
+  "<|26.30|>": 51680,
+  "<|26.32|>": 51681,
+  "<|26.34|>": 51682,
+  "<|26.36|>": 51683,
+  "<|26.38|>": 51684,
+  "<|26.40|>": 51685,
+  "<|26.42|>": 51686,
+  "<|26.44|>": 51687,
+  "<|26.46|>": 51688,
+  "<|26.48|>": 51689,
+  "<|26.50|>": 51690,
+  "<|26.52|>": 51691,
+  "<|26.54|>": 51692,
+  "<|26.56|>": 51693,
+  "<|26.58|>": 51694,
+  "<|26.60|>": 51695,
+  "<|26.62|>": 51696,
+  "<|26.64|>": 51697,
+  "<|26.66|>": 51698,
+  "<|26.68|>": 51699,
+  "<|26.70|>": 51700,
+  "<|26.72|>": 51701,
+  "<|26.74|>": 51702,
+  "<|26.76|>": 51703,
+  "<|26.78|>": 51704,
+  "<|26.80|>": 51705,
+  "<|26.82|>": 51706,
+  "<|26.84|>": 51707,
+  "<|26.86|>": 51708,
+  "<|26.88|>": 51709,
+  "<|26.90|>": 51710,
+  "<|26.92|>": 51711,
+  "<|26.94|>": 51712,
+  "<|26.96|>": 51713,
+  "<|26.98|>": 51714,
+  "<|27.00|>": 51715,
+  "<|27.02|>": 51716,
+  "<|27.04|>": 51717,
+  "<|27.06|>": 51718,
+  "<|27.08|>": 51719,
+  "<|27.10|>": 51720,
+  "<|27.12|>": 51721,
+  "<|27.14|>": 51722,
+  "<|27.16|>": 51723,
+  "<|27.18|>": 51724,
+  "<|27.20|>": 51725,
+  "<|27.22|>": 51726,
+  "<|27.24|>": 51727,
+  "<|27.26|>": 51728,
+  "<|27.28|>": 51729,
+  "<|27.30|>": 51730,
+  "<|27.32|>": 51731,
+  "<|27.34|>": 51732,
+  "<|27.36|>": 51733,
+  "<|27.38|>": 51734,
+  "<|27.40|>": 51735,
+  "<|27.42|>": 51736,
+  "<|27.44|>": 51737,
+  "<|27.46|>": 51738,
+  "<|27.48|>": 51739,
+  "<|27.50|>": 51740,
+  "<|27.52|>": 51741,
+  "<|27.54|>": 51742,
+  "<|27.56|>": 51743,
+  "<|27.58|>": 51744,
+  "<|27.60|>": 51745,
+  "<|27.62|>": 51746,
+  "<|27.64|>": 51747,
+  "<|27.66|>": 51748,
+  "<|27.68|>": 51749,
+  "<|27.70|>": 51750,
+  "<|27.72|>": 51751,
+  "<|27.74|>": 51752,
+  "<|27.76|>": 51753,
+  "<|27.78|>": 51754,
+  "<|27.80|>": 51755,
+  "<|27.82|>": 51756,
+  "<|27.84|>": 51757,
+  "<|27.86|>": 51758,
+  "<|27.88|>": 51759,
+  "<|27.90|>": 51760,
+  "<|27.92|>": 51761,
+  "<|27.94|>": 51762,
+  "<|27.96|>": 51763,
+  "<|27.98|>": 51764,
+  "<|28.00|>": 51765,
+  "<|28.02|>": 51766,
+  "<|28.04|>": 51767,
+  "<|28.06|>": 51768,
+  "<|28.08|>": 51769,
+  "<|28.10|>": 51770,
+  "<|28.12|>": 51771,
+  "<|28.14|>": 51772,
+  "<|28.16|>": 51773,
+  "<|28.18|>": 51774,
+  "<|28.20|>": 51775,
+  "<|28.22|>": 51776,
+  "<|28.24|>": 51777,
+  "<|28.26|>": 51778,
+  "<|28.28|>": 51779,
+  "<|28.30|>": 51780,
+  "<|28.32|>": 51781,
+  "<|28.34|>": 51782,
+  "<|28.36|>": 51783,
+  "<|28.38|>": 51784,
+  "<|28.40|>": 51785,
+  "<|28.42|>": 51786,
+  "<|28.44|>": 51787,
+  "<|28.46|>": 51788,
+  "<|28.48|>": 51789,
+  "<|28.50|>": 51790,
+  "<|28.52|>": 51791,
+  "<|28.54|>": 51792,
+  "<|28.56|>": 51793,
+  "<|28.58|>": 51794,
+  "<|28.60|>": 51795,
+  "<|28.62|>": 51796,
+  "<|28.64|>": 51797,
+  "<|28.66|>": 51798,
+  "<|28.68|>": 51799,
+  "<|28.70|>": 51800,
+  "<|28.72|>": 51801,
+  "<|28.74|>": 51802,
+  "<|28.76|>": 51803,
+  "<|28.78|>": 51804,
+  "<|28.80|>": 51805,
+  "<|28.82|>": 51806,
+  "<|28.84|>": 51807,
+  "<|28.86|>": 51808,
+  "<|28.88|>": 51809,
+  "<|28.90|>": 51810,
+  "<|28.92|>": 51811,
+  "<|28.94|>": 51812,
+  "<|28.96|>": 51813,
+  "<|28.98|>": 51814,
+  "<|29.00|>": 51815,
+  "<|29.02|>": 51816,
+  "<|29.04|>": 51817,
+  "<|29.06|>": 51818,
+  "<|29.08|>": 51819,
+  "<|29.10|>": 51820,
+  "<|29.12|>": 51821,
+  "<|29.14|>": 51822,
+  "<|29.16|>": 51823,
+  "<|29.18|>": 51824,
+  "<|29.20|>": 51825,
+  "<|29.22|>": 51826,
+  "<|29.24|>": 51827,
+  "<|29.26|>": 51828,
+  "<|29.28|>": 51829,
+  "<|29.30|>": 51830,
+  "<|29.32|>": 51831,
+  "<|29.34|>": 51832,
+  "<|29.36|>": 51833,
+  "<|29.38|>": 51834,
+  "<|29.40|>": 51835,
+  "<|29.42|>": 51836,
+  "<|29.44|>": 51837,
+  "<|29.46|>": 51838,
+  "<|29.48|>": 51839,
+  "<|29.50|>": 51840,
+  "<|29.52|>": 51841,
+  "<|29.54|>": 51842,
+  "<|29.56|>": 51843,
+  "<|29.58|>": 51844,
+  "<|29.60|>": 51845,
+  "<|29.62|>": 51846,
+  "<|29.64|>": 51847,
+  "<|29.66|>": 51848,
+  "<|29.68|>": 51849,
+  "<|29.70|>": 51850,
+  "<|29.72|>": 51851,
+  "<|29.74|>": 51852,
+  "<|29.76|>": 51853,
+  "<|29.78|>": 51854,
+  "<|29.80|>": 51855,
+  "<|29.82|>": 51856,
+  "<|29.84|>": 51857,
+  "<|29.86|>": 51858,
+  "<|29.88|>": 51859,
+  "<|29.90|>": 51860,
+  "<|29.92|>": 51861,
+  "<|29.94|>": 51862,
+  "<|29.96|>": 51863,
+  "<|29.98|>": 51864,
+  "<|3.00|>": 50515,
+  "<|3.02|>": 50516,
+  "<|3.04|>": 50517,
+  "<|3.06|>": 50518,
+  "<|3.08|>": 50519,
+  "<|3.10|>": 50520,
+  "<|3.12|>": 50521,
+  "<|3.14|>": 50522,
+  "<|3.16|>": 50523,
+  "<|3.18|>": 50524,
+  "<|3.20|>": 50525,
+  "<|3.22|>": 50526,
+  "<|3.24|>": 50527,
+  "<|3.26|>": 50528,
+  "<|3.28|>": 50529,
+  "<|3.30|>": 50530,
+  "<|3.32|>": 50531,
+  "<|3.34|>": 50532,
+  "<|3.36|>": 50533,
+  "<|3.38|>": 50534,
+  "<|3.40|>": 50535,
+  "<|3.42|>": 50536,
+  "<|3.44|>": 50537,
+  "<|3.46|>": 50538,
+  "<|3.48|>": 50539,
+  "<|3.50|>": 50540,
+  "<|3.52|>": 50541,
+  "<|3.54|>": 50542,
+  "<|3.56|>": 50543,
+  "<|3.58|>": 50544,
+  "<|3.60|>": 50545,
+  "<|3.62|>": 50546,
+  "<|3.64|>": 50547,
+  "<|3.66|>": 50548,
+  "<|3.68|>": 50549,
+  "<|3.70|>": 50550,
+  "<|3.72|>": 50551,
+  "<|3.74|>": 50552,
+  "<|3.76|>": 50553,
+  "<|3.78|>": 50554,
+  "<|3.80|>": 50555,
+  "<|3.82|>": 50556,
+  "<|3.84|>": 50557,
+  "<|3.86|>": 50558,
+  "<|3.88|>": 50559,
+  "<|3.90|>": 50560,
+  "<|3.92|>": 50561,
+  "<|3.94|>": 50562,
+  "<|3.96|>": 50563,
+  "<|3.98|>": 50564,
+  "<|30.00|>": 51865,
+  "<|4.00|>": 50565,
+  "<|4.02|>": 50566,
+  "<|4.04|>": 50567,
+  "<|4.06|>": 50568,
+  "<|4.08|>": 50569,
+  "<|4.10|>": 50570,
+  "<|4.12|>": 50571,
+  "<|4.14|>": 50572,
+  "<|4.16|>": 50573,
+  "<|4.18|>": 50574,
+  "<|4.20|>": 50575,
+  "<|4.22|>": 50576,
+  "<|4.24|>": 50577,
+  "<|4.26|>": 50578,
+  "<|4.28|>": 50579,
+  "<|4.30|>": 50580,
+  "<|4.32|>": 50581,
+  "<|4.34|>": 50582,
+  "<|4.36|>": 50583,
+  "<|4.38|>": 50584,
+  "<|4.40|>": 50585,
+  "<|4.42|>": 50586,
+  "<|4.44|>": 50587,
+  "<|4.46|>": 50588,
+  "<|4.48|>": 50589,
+  "<|4.50|>": 50590,
+  "<|4.52|>": 50591,
+  "<|4.54|>": 50592,
+  "<|4.56|>": 50593,
+  "<|4.58|>": 50594,
+  "<|4.60|>": 50595,
+  "<|4.62|>": 50596,
+  "<|4.64|>": 50597,
+  "<|4.66|>": 50598,
+  "<|4.68|>": 50599,
+  "<|4.70|>": 50600,
+  "<|4.72|>": 50601,
+  "<|4.74|>": 50602,
+  "<|4.76|>": 50603,
+  "<|4.78|>": 50604,
+  "<|4.80|>": 50605,
+  "<|4.82|>": 50606,
+  "<|4.84|>": 50607,
+  "<|4.86|>": 50608,
+  "<|4.88|>": 50609,
+  "<|4.90|>": 50610,
+  "<|4.92|>": 50611,
+  "<|4.94|>": 50612,
+  "<|4.96|>": 50613,
+  "<|4.98|>": 50614,
+  "<|5.00|>": 50615,
+  "<|5.02|>": 50616,
+  "<|5.04|>": 50617,
+  "<|5.06|>": 50618,
+  "<|5.08|>": 50619,
+  "<|5.10|>": 50620,
+  "<|5.12|>": 50621,
+  "<|5.14|>": 50622,
+  "<|5.16|>": 50623,
+  "<|5.18|>": 50624,
+  "<|5.20|>": 50625,
+  "<|5.22|>": 50626,
+  "<|5.24|>": 50627,
+  "<|5.26|>": 50628,
+  "<|5.28|>": 50629,
+  "<|5.30|>": 50630,
+  "<|5.32|>": 50631,
+  "<|5.34|>": 50632,
+  "<|5.36|>": 50633,
+  "<|5.38|>": 50634,
+  "<|5.40|>": 50635,
+  "<|5.42|>": 50636,
+  "<|5.44|>": 50637,
+  "<|5.46|>": 50638,
+  "<|5.48|>": 50639,
+  "<|5.50|>": 50640,
+  "<|5.52|>": 50641,
+  "<|5.54|>": 50642,
+  "<|5.56|>": 50643,
+  "<|5.58|>": 50644,
+  "<|5.60|>": 50645,
+  "<|5.62|>": 50646,
+  "<|5.64|>": 50647,
+  "<|5.66|>": 50648,
+  "<|5.68|>": 50649,
+  "<|5.70|>": 50650,
+  "<|5.72|>": 50651,
+  "<|5.74|>": 50652,
+  "<|5.76|>": 50653,
+  "<|5.78|>": 50654,
+  "<|5.80|>": 50655,
+  "<|5.82|>": 50656,
+  "<|5.84|>": 50657,
+  "<|5.86|>": 50658,
+  "<|5.88|>": 50659,
+  "<|5.90|>": 50660,
+  "<|5.92|>": 50661,
+  "<|5.94|>": 50662,
+  "<|5.96|>": 50663,
+  "<|5.98|>": 50664,
+  "<|6.00|>": 50665,
+  "<|6.02|>": 50666,
+  "<|6.04|>": 50667,
+  "<|6.06|>": 50668,
+  "<|6.08|>": 50669,
+  "<|6.10|>": 50670,
+  "<|6.12|>": 50671,
+  "<|6.14|>": 50672,
+  "<|6.16|>": 50673,
+  "<|6.18|>": 50674,
+  "<|6.20|>": 50675,
+  "<|6.22|>": 50676,
+  "<|6.24|>": 50677,
+  "<|6.26|>": 50678,
+  "<|6.28|>": 50679,
+  "<|6.30|>": 50680,
+  "<|6.32|>": 50681,
+  "<|6.34|>": 50682,
+  "<|6.36|>": 50683,
+  "<|6.38|>": 50684,
+  "<|6.40|>": 50685,
+  "<|6.42|>": 50686,
+  "<|6.44|>": 50687,
+  "<|6.46|>": 50688,
+  "<|6.48|>": 50689,
+  "<|6.50|>": 50690,
+  "<|6.52|>": 50691,
+  "<|6.54|>": 50692,
+  "<|6.56|>": 50693,
+  "<|6.58|>": 50694,
+  "<|6.60|>": 50695,
+  "<|6.62|>": 50696,
+  "<|6.64|>": 50697,
+  "<|6.66|>": 50698,
+  "<|6.68|>": 50699,
+  "<|6.70|>": 50700,
+  "<|6.72|>": 50701,
+  "<|6.74|>": 50702,
+  "<|6.76|>": 50703,
+  "<|6.78|>": 50704,
+  "<|6.80|>": 50705,
+  "<|6.82|>": 50706,
+  "<|6.84|>": 50707,
+  "<|6.86|>": 50708,
+  "<|6.88|>": 50709,
+  "<|6.90|>": 50710,
+  "<|6.92|>": 50711,
+  "<|6.94|>": 50712,
+  "<|6.96|>": 50713,
+  "<|6.98|>": 50714,
+  "<|7.00|>": 50715,
+  "<|7.02|>": 50716,
+  "<|7.04|>": 50717,
+  "<|7.06|>": 50718,
+  "<|7.08|>": 50719,
+  "<|7.10|>": 50720,
+  "<|7.12|>": 50721,
+  "<|7.14|>": 50722,
+  "<|7.16|>": 50723,
+  "<|7.18|>": 50724,
+  "<|7.20|>": 50725,
+  "<|7.22|>": 50726,
+  "<|7.24|>": 50727,
+  "<|7.26|>": 50728,
+  "<|7.28|>": 50729,
+  "<|7.30|>": 50730,
+  "<|7.32|>": 50731,
+  "<|7.34|>": 50732,
+  "<|7.36|>": 50733,
+  "<|7.38|>": 50734,
+  "<|7.40|>": 50735,
+  "<|7.42|>": 50736,
+  "<|7.44|>": 50737,
+  "<|7.46|>": 50738,
+  "<|7.48|>": 50739,
+  "<|7.50|>": 50740,
+  "<|7.52|>": 50741,
+  "<|7.54|>": 50742,
+  "<|7.56|>": 50743,
+  "<|7.58|>": 50744,
+  "<|7.60|>": 50745,
+  "<|7.62|>": 50746,
+  "<|7.64|>": 50747,
+  "<|7.66|>": 50748,
+  "<|7.68|>": 50749,
+  "<|7.70|>": 50750,
+  "<|7.72|>": 50751,
+  "<|7.74|>": 50752,
+  "<|7.76|>": 50753,
+  "<|7.78|>": 50754,
+  "<|7.80|>": 50755,
+  "<|7.82|>": 50756,
+  "<|7.84|>": 50757,
+  "<|7.86|>": 50758,
+  "<|7.88|>": 50759,
+  "<|7.90|>": 50760,
+  "<|7.92|>": 50761,
+  "<|7.94|>": 50762,
+  "<|7.96|>": 50763,
+  "<|7.98|>": 50764,
+  "<|8.00|>": 50765,
+  "<|8.02|>": 50766,
+  "<|8.04|>": 50767,
+  "<|8.06|>": 50768,
+  "<|8.08|>": 50769,
+  "<|8.10|>": 50770,
+  "<|8.12|>": 50771,
+  "<|8.14|>": 50772,
+  "<|8.16|>": 50773,
+  "<|8.18|>": 50774,
+  "<|8.20|>": 50775,
+  "<|8.22|>": 50776,
+  "<|8.24|>": 50777,
+  "<|8.26|>": 50778,
+  "<|8.28|>": 50779,
+  "<|8.30|>": 50780,
+  "<|8.32|>": 50781,
+  "<|8.34|>": 50782,
+  "<|8.36|>": 50783,
+  "<|8.38|>": 50784,
+  "<|8.40|>": 50785,
+  "<|8.42|>": 50786,
+  "<|8.44|>": 50787,
+  "<|8.46|>": 50788,
+  "<|8.48|>": 50789,
+  "<|8.50|>": 50790,
+  "<|8.52|>": 50791,
+  "<|8.54|>": 50792,
+  "<|8.56|>": 50793,
+  "<|8.58|>": 50794,
+  "<|8.60|>": 50795,
+  "<|8.62|>": 50796,
+  "<|8.64|>": 50797,
+  "<|8.66|>": 50798,
+  "<|8.68|>": 50799,
+  "<|8.70|>": 50800,
+  "<|8.72|>": 50801,
+  "<|8.74|>": 50802,
+  "<|8.76|>": 50803,
+  "<|8.78|>": 50804,
+  "<|8.80|>": 50805,
+  "<|8.82|>": 50806,
+  "<|8.84|>": 50807,
+  "<|8.86|>": 50808,
+  "<|8.88|>": 50809,
+  "<|8.90|>": 50810,
+  "<|8.92|>": 50811,
+  "<|8.94|>": 50812,
+  "<|8.96|>": 50813,
+  "<|8.98|>": 50814,
+  "<|9.00|>": 50815,
+  "<|9.02|>": 50816,
+  "<|9.04|>": 50817,
+  "<|9.06|>": 50818,
+  "<|9.08|>": 50819,
+  "<|9.10|>": 50820,
+  "<|9.12|>": 50821,
+  "<|9.14|>": 50822,
+  "<|9.16|>": 50823,
+  "<|9.18|>": 50824,
+  "<|9.20|>": 50825,
+  "<|9.22|>": 50826,
+  "<|9.24|>": 50827,
+  "<|9.26|>": 50828,
+  "<|9.28|>": 50829,
+  "<|9.30|>": 50830,
+  "<|9.32|>": 50831,
+  "<|9.34|>": 50832,
+  "<|9.36|>": 50833,
+  "<|9.38|>": 50834,
+  "<|9.40|>": 50835,
+  "<|9.42|>": 50836,
+  "<|9.44|>": 50837,
+  "<|9.46|>": 50838,
+  "<|9.48|>": 50839,
+  "<|9.50|>": 50840,
+  "<|9.52|>": 50841,
+  "<|9.54|>": 50842,
+  "<|9.56|>": 50843,
+  "<|9.58|>": 50844,
+  "<|9.60|>": 50845,
+  "<|9.62|>": 50846,
+  "<|9.64|>": 50847,
+  "<|9.66|>": 50848,
+  "<|9.68|>": 50849,
+  "<|9.70|>": 50850,
+  "<|9.72|>": 50851,
+  "<|9.74|>": 50852,
+  "<|9.76|>": 50853,
+  "<|9.78|>": 50854,
+  "<|9.80|>": 50855,
+  "<|9.82|>": 50856,
+  "<|9.84|>": 50857,
+  "<|9.86|>": 50858,
+  "<|9.88|>": 50859,
+  "<|9.90|>": 50860,
+  "<|9.92|>": 50861,
+  "<|9.94|>": 50862,
+  "<|9.96|>": 50863,
+  "<|9.98|>": 50864,
+  "<|af|>": 50327,
+  "<|am|>": 50334,
+  "<|ar|>": 50272,
+  "<|as|>": 50350,
+  "<|az|>": 50304,
+  "<|ba|>": 50355,
+  "<|be|>": 50330,
+  "<|bg|>": 50292,
+  "<|bn|>": 50302,
+  "<|bo|>": 50347,
+  "<|br|>": 50309,
+  "<|bs|>": 50315,
+  "<|ca|>": 50270,
+  "<|cs|>": 50283,
+  "<|cy|>": 50297,
+  "<|da|>": 50285,
+  "<|de|>": 50261,
+  "<|el|>": 50281,
+  "<|endoftext|>": 50257,
+  "<|en|>": 50259,
+  "<|es|>": 50262,
+  "<|et|>": 50307,
+  "<|eu|>": 50310,
+  "<|fa|>": 50300,
+  "<|fi|>": 50277,
+  "<|fo|>": 50338,
+  "<|fr|>": 50265,
+  "<|gl|>": 50319,
+  "<|gu|>": 50333,
+  "<|haw|>": 50352,
+  "<|ha|>": 50354,
+  "<|he|>": 50279,
+  "<|hi|>": 50276,
+  "<|hr|>": 50291,
+  "<|ht|>": 50339,
+  "<|hu|>": 50286,
+  "<|hy|>": 50312,
+  "<|id|>": 50275,
+  "<|is|>": 50311,
+  "<|it|>": 50274,
+  "<|ja|>": 50266,
+  "<|jw|>": 50356,
+  "<|ka|>": 50329,
+  "<|kk|>": 50316,
+  "<|km|>": 50323,
+  "<|kn|>": 50306,
+  "<|ko|>": 50264,
+  "<|la|>": 50294,
+  "<|lb|>": 50345,
+  "<|ln|>": 50353,
+  "<|lo|>": 50336,
+  "<|lt|>": 50293,
+  "<|lv|>": 50301,
+  "<|mg|>": 50349,
+  "<|mi|>": 50295,
+  "<|mk|>": 50308,
+  "<|ml|>": 50296,
+  "<|mn|>": 50314,
+  "<|mr|>": 50320,
+  "<|ms|>": 50282,
+  "<|mt|>": 50343,
+  "<|my|>": 50346,
+  "<|ne|>": 50313,
+  "<|nl|>": 50271,
+  "<|nn|>": 50342,
+  "<|nospeech|>": 50363,
+  "<|notimestamps|>": 50364,
+  "<|no|>": 50288,
+  "<|oc|>": 50328,
+  "<|pa|>": 50321,
+  "<|pl|>": 50269,
+  "<|ps|>": 50340,
+  "<|pt|>": 50267,
+  "<|ro|>": 50284,
+  "<|ru|>": 50263,
+  "<|sa|>": 50344,
+  "<|sd|>": 50332,
+  "<|si|>": 50322,
+  "<|sk|>": 50298,
+  "<|sl|>": 50305,
+  "<|sn|>": 50324,
+  "<|so|>": 50326,
+  "<|sq|>": 50317,
+  "<|sr|>": 50303,
+  "<|startoflm|>": 50361,
+  "<|startofprev|>": 50362,
+  "<|startoftranscript|>": 50258,
+  "<|su|>": 50357,
+  "<|sv|>": 50273,
+  "<|sw|>": 50318,
+  "<|ta|>": 50287,
+  "<|te|>": 50299,
+  "<|tg|>": 50331,
+  "<|th|>": 50289,
+  "<|tk|>": 50341,
+  "<|tl|>": 50348,
+  "<|transcribe|>": 50360,
+  "<|translate|>": 50359,
+  "<|tr|>": 50268,
+  "<|tt|>": 50351,
+  "<|uk|>": 50280,
+  "<|ur|>": 50290,
+  "<|uz|>": 50337,
+  "<|vi|>": 50278,
+  "<|yi|>": 50335,
+  "<|yo|>": 50325,
+  "<|yue|>": 50358,
+  "<|zh|>": 50260
+}

checkpoint-500-epoch-0-val-wer-96.036/added_tokens.json ADDED Viewed

	@@ -0,0 +1,1611 @@

+{
+  "<|0.00|>": 50365,
+  "<|0.02|>": 50366,
+  "<|0.04|>": 50367,
+  "<|0.06|>": 50368,
+  "<|0.08|>": 50369,
+  "<|0.10|>": 50370,
+  "<|0.12|>": 50371,
+  "<|0.14|>": 50372,
+  "<|0.16|>": 50373,
+  "<|0.18|>": 50374,
+  "<|0.20|>": 50375,
+  "<|0.22|>": 50376,
+  "<|0.24|>": 50377,
+  "<|0.26|>": 50378,
+  "<|0.28|>": 50379,
+  "<|0.30|>": 50380,
+  "<|0.32|>": 50381,
+  "<|0.34|>": 50382,
+  "<|0.36|>": 50383,
+  "<|0.38|>": 50384,
+  "<|0.40|>": 50385,
+  "<|0.42|>": 50386,
+  "<|0.44|>": 50387,
+  "<|0.46|>": 50388,
+  "<|0.48|>": 50389,
+  "<|0.50|>": 50390,
+  "<|0.52|>": 50391,
+  "<|0.54|>": 50392,
+  "<|0.56|>": 50393,
+  "<|0.58|>": 50394,
+  "<|0.60|>": 50395,
+  "<|0.62|>": 50396,
+  "<|0.64|>": 50397,
+  "<|0.66|>": 50398,
+  "<|0.68|>": 50399,
+  "<|0.70|>": 50400,
+  "<|0.72|>": 50401,
+  "<|0.74|>": 50402,
+  "<|0.76|>": 50403,
+  "<|0.78|>": 50404,
+  "<|0.80|>": 50405,
+  "<|0.82|>": 50406,
+  "<|0.84|>": 50407,
+  "<|0.86|>": 50408,
+  "<|0.88|>": 50409,
+  "<|0.90|>": 50410,
+  "<|0.92|>": 50411,
+  "<|0.94|>": 50412,
+  "<|0.96|>": 50413,
+  "<|0.98|>": 50414,
+  "<|1.00|>": 50415,
+  "<|1.02|>": 50416,
+  "<|1.04|>": 50417,
+  "<|1.06|>": 50418,
+  "<|1.08|>": 50419,
+  "<|1.10|>": 50420,
+  "<|1.12|>": 50421,
+  "<|1.14|>": 50422,
+  "<|1.16|>": 50423,
+  "<|1.18|>": 50424,
+  "<|1.20|>": 50425,
+  "<|1.22|>": 50426,
+  "<|1.24|>": 50427,
+  "<|1.26|>": 50428,
+  "<|1.28|>": 50429,
+  "<|1.30|>": 50430,
+  "<|1.32|>": 50431,
+  "<|1.34|>": 50432,
+  "<|1.36|>": 50433,
+  "<|1.38|>": 50434,
+  "<|1.40|>": 50435,
+  "<|1.42|>": 50436,
+  "<|1.44|>": 50437,
+  "<|1.46|>": 50438,
+  "<|1.48|>": 50439,
+  "<|1.50|>": 50440,
+  "<|1.52|>": 50441,
+  "<|1.54|>": 50442,
+  "<|1.56|>": 50443,
+  "<|1.58|>": 50444,
+  "<|1.60|>": 50445,
+  "<|1.62|>": 50446,
+  "<|1.64|>": 50447,
+  "<|1.66|>": 50448,
+  "<|1.68|>": 50449,
+  "<|1.70|>": 50450,
+  "<|1.72|>": 50451,
+  "<|1.74|>": 50452,
+  "<|1.76|>": 50453,
+  "<|1.78|>": 50454,
+  "<|1.80|>": 50455,
+  "<|1.82|>": 50456,
+  "<|1.84|>": 50457,
+  "<|1.86|>": 50458,
+  "<|1.88|>": 50459,
+  "<|1.90|>": 50460,
+  "<|1.92|>": 50461,
+  "<|1.94|>": 50462,
+  "<|1.96|>": 50463,
+  "<|1.98|>": 50464,
+  "<|10.00|>": 50865,
+  "<|10.02|>": 50866,
+  "<|10.04|>": 50867,
+  "<|10.06|>": 50868,
+  "<|10.08|>": 50869,
+  "<|10.10|>": 50870,
+  "<|10.12|>": 50871,
+  "<|10.14|>": 50872,
+  "<|10.16|>": 50873,
+  "<|10.18|>": 50874,
+  "<|10.20|>": 50875,
+  "<|10.22|>": 50876,
+  "<|10.24|>": 50877,
+  "<|10.26|>": 50878,
+  "<|10.28|>": 50879,
+  "<|10.30|>": 50880,
+  "<|10.32|>": 50881,
+  "<|10.34|>": 50882,
+  "<|10.36|>": 50883,
+  "<|10.38|>": 50884,
+  "<|10.40|>": 50885,
+  "<|10.42|>": 50886,
+  "<|10.44|>": 50887,
+  "<|10.46|>": 50888,
+  "<|10.48|>": 50889,
+  "<|10.50|>": 50890,
+  "<|10.52|>": 50891,
+  "<|10.54|>": 50892,
+  "<|10.56|>": 50893,
+  "<|10.58|>": 50894,
+  "<|10.60|>": 50895,
+  "<|10.62|>": 50896,
+  "<|10.64|>": 50897,
+  "<|10.66|>": 50898,
+  "<|10.68|>": 50899,
+  "<|10.70|>": 50900,
+  "<|10.72|>": 50901,
+  "<|10.74|>": 50902,
+  "<|10.76|>": 50903,
+  "<|10.78|>": 50904,
+  "<|10.80|>": 50905,
+  "<|10.82|>": 50906,
+  "<|10.84|>": 50907,
+  "<|10.86|>": 50908,
+  "<|10.88|>": 50909,
+  "<|10.90|>": 50910,
+  "<|10.92|>": 50911,
+  "<|10.94|>": 50912,
+  "<|10.96|>": 50913,
+  "<|10.98|>": 50914,
+  "<|11.00|>": 50915,
+  "<|11.02|>": 50916,
+  "<|11.04|>": 50917,
+  "<|11.06|>": 50918,
+  "<|11.08|>": 50919,
+  "<|11.10|>": 50920,
+  "<|11.12|>": 50921,
+  "<|11.14|>": 50922,
+  "<|11.16|>": 50923,
+  "<|11.18|>": 50924,
+  "<|11.20|>": 50925,
+  "<|11.22|>": 50926,
+  "<|11.24|>": 50927,
+  "<|11.26|>": 50928,
+  "<|11.28|>": 50929,
+  "<|11.30|>": 50930,
+  "<|11.32|>": 50931,
+  "<|11.34|>": 50932,
+  "<|11.36|>": 50933,
+  "<|11.38|>": 50934,
+  "<|11.40|>": 50935,
+  "<|11.42|>": 50936,
+  "<|11.44|>": 50937,
+  "<|11.46|>": 50938,
+  "<|11.48|>": 50939,
+  "<|11.50|>": 50940,
+  "<|11.52|>": 50941,
+  "<|11.54|>": 50942,
+  "<|11.56|>": 50943,
+  "<|11.58|>": 50944,
+  "<|11.60|>": 50945,
+  "<|11.62|>": 50946,
+  "<|11.64|>": 50947,
+  "<|11.66|>": 50948,
+  "<|11.68|>": 50949,
+  "<|11.70|>": 50950,
+  "<|11.72|>": 50951,
+  "<|11.74|>": 50952,
+  "<|11.76|>": 50953,
+  "<|11.78|>": 50954,
+  "<|11.80|>": 50955,
+  "<|11.82|>": 50956,
+  "<|11.84|>": 50957,
+  "<|11.86|>": 50958,
+  "<|11.88|>": 50959,
+  "<|11.90|>": 50960,
+  "<|11.92|>": 50961,
+  "<|11.94|>": 50962,
+  "<|11.96|>": 50963,
+  "<|11.98|>": 50964,
+  "<|12.00|>": 50965,
+  "<|12.02|>": 50966,
+  "<|12.04|>": 50967,
+  "<|12.06|>": 50968,
+  "<|12.08|>": 50969,
+  "<|12.10|>": 50970,
+  "<|12.12|>": 50971,
+  "<|12.14|>": 50972,
+  "<|12.16|>": 50973,
+  "<|12.18|>": 50974,
+  "<|12.20|>": 50975,
+  "<|12.22|>": 50976,
+  "<|12.24|>": 50977,
+  "<|12.26|>": 50978,
+  "<|12.28|>": 50979,
+  "<|12.30|>": 50980,
+  "<|12.32|>": 50981,
+  "<|12.34|>": 50982,
+  "<|12.36|>": 50983,
+  "<|12.38|>": 50984,
+  "<|12.40|>": 50985,
+  "<|12.42|>": 50986,
+  "<|12.44|>": 50987,
+  "<|12.46|>": 50988,
+  "<|12.48|>": 50989,
+  "<|12.50|>": 50990,
+  "<|12.52|>": 50991,
+  "<|12.54|>": 50992,
+  "<|12.56|>": 50993,
+  "<|12.58|>": 50994,
+  "<|12.60|>": 50995,
+  "<|12.62|>": 50996,
+  "<|12.64|>": 50997,
+  "<|12.66|>": 50998,
+  "<|12.68|>": 50999,
+  "<|12.70|>": 51000,
+  "<|12.72|>": 51001,
+  "<|12.74|>": 51002,
+  "<|12.76|>": 51003,
+  "<|12.78|>": 51004,
+  "<|12.80|>": 51005,
+  "<|12.82|>": 51006,
+  "<|12.84|>": 51007,
+  "<|12.86|>": 51008,
+  "<|12.88|>": 51009,
+  "<|12.90|>": 51010,
+  "<|12.92|>": 51011,
+  "<|12.94|>": 51012,
+  "<|12.96|>": 51013,
+  "<|12.98|>": 51014,
+  "<|13.00|>": 51015,
+  "<|13.02|>": 51016,
+  "<|13.04|>": 51017,
+  "<|13.06|>": 51018,
+  "<|13.08|>": 51019,
+  "<|13.10|>": 51020,
+  "<|13.12|>": 51021,
+  "<|13.14|>": 51022,
+  "<|13.16|>": 51023,
+  "<|13.18|>": 51024,
+  "<|13.20|>": 51025,
+  "<|13.22|>": 51026,
+  "<|13.24|>": 51027,
+  "<|13.26|>": 51028,
+  "<|13.28|>": 51029,
+  "<|13.30|>": 51030,
+  "<|13.32|>": 51031,
+  "<|13.34|>": 51032,
+  "<|13.36|>": 51033,
+  "<|13.38|>": 51034,
+  "<|13.40|>": 51035,
+  "<|13.42|>": 51036,
+  "<|13.44|>": 51037,
+  "<|13.46|>": 51038,
+  "<|13.48|>": 51039,
+  "<|13.50|>": 51040,
+  "<|13.52|>": 51041,
+  "<|13.54|>": 51042,
+  "<|13.56|>": 51043,
+  "<|13.58|>": 51044,
+  "<|13.60|>": 51045,
+  "<|13.62|>": 51046,
+  "<|13.64|>": 51047,
+  "<|13.66|>": 51048,
+  "<|13.68|>": 51049,
+  "<|13.70|>": 51050,
+  "<|13.72|>": 51051,
+  "<|13.74|>": 51052,
+  "<|13.76|>": 51053,
+  "<|13.78|>": 51054,
+  "<|13.80|>": 51055,
+  "<|13.82|>": 51056,
+  "<|13.84|>": 51057,
+  "<|13.86|>": 51058,
+  "<|13.88|>": 51059,
+  "<|13.90|>": 51060,
+  "<|13.92|>": 51061,
+  "<|13.94|>": 51062,
+  "<|13.96|>": 51063,
+  "<|13.98|>": 51064,
+  "<|14.00|>": 51065,
+  "<|14.02|>": 51066,
+  "<|14.04|>": 51067,
+  "<|14.06|>": 51068,
+  "<|14.08|>": 51069,
+  "<|14.10|>": 51070,
+  "<|14.12|>": 51071,
+  "<|14.14|>": 51072,
+  "<|14.16|>": 51073,
+  "<|14.18|>": 51074,
+  "<|14.20|>": 51075,
+  "<|14.22|>": 51076,
+  "<|14.24|>": 51077,
+  "<|14.26|>": 51078,
+  "<|14.28|>": 51079,
+  "<|14.30|>": 51080,
+  "<|14.32|>": 51081,
+  "<|14.34|>": 51082,
+  "<|14.36|>": 51083,
+  "<|14.38|>": 51084,
+  "<|14.40|>": 51085,
+  "<|14.42|>": 51086,
+  "<|14.44|>": 51087,
+  "<|14.46|>": 51088,
+  "<|14.48|>": 51089,
+  "<|14.50|>": 51090,
+  "<|14.52|>": 51091,
+  "<|14.54|>": 51092,
+  "<|14.56|>": 51093,
+  "<|14.58|>": 51094,
+  "<|14.60|>": 51095,
+  "<|14.62|>": 51096,
+  "<|14.64|>": 51097,
+  "<|14.66|>": 51098,
+  "<|14.68|>": 51099,
+  "<|14.70|>": 51100,
+  "<|14.72|>": 51101,
+  "<|14.74|>": 51102,
+  "<|14.76|>": 51103,
+  "<|14.78|>": 51104,
+  "<|14.80|>": 51105,
+  "<|14.82|>": 51106,
+  "<|14.84|>": 51107,
+  "<|14.86|>": 51108,
+  "<|14.88|>": 51109,
+  "<|14.90|>": 51110,
+  "<|14.92|>": 51111,
+  "<|14.94|>": 51112,
+  "<|14.96|>": 51113,
+  "<|14.98|>": 51114,
+  "<|15.00|>": 51115,
+  "<|15.02|>": 51116,
+  "<|15.04|>": 51117,
+  "<|15.06|>": 51118,
+  "<|15.08|>": 51119,
+  "<|15.10|>": 51120,
+  "<|15.12|>": 51121,
+  "<|15.14|>": 51122,
+  "<|15.16|>": 51123,
+  "<|15.18|>": 51124,
+  "<|15.20|>": 51125,
+  "<|15.22|>": 51126,
+  "<|15.24|>": 51127,
+  "<|15.26|>": 51128,
+  "<|15.28|>": 51129,
+  "<|15.30|>": 51130,
+  "<|15.32|>": 51131,
+  "<|15.34|>": 51132,
+  "<|15.36|>": 51133,
+  "<|15.38|>": 51134,
+  "<|15.40|>": 51135,
+  "<|15.42|>": 51136,
+  "<|15.44|>": 51137,
+  "<|15.46|>": 51138,
+  "<|15.48|>": 51139,
+  "<|15.50|>": 51140,
+  "<|15.52|>": 51141,
+  "<|15.54|>": 51142,
+  "<|15.56|>": 51143,
+  "<|15.58|>": 51144,
+  "<|15.60|>": 51145,
+  "<|15.62|>": 51146,
+  "<|15.64|>": 51147,
+  "<|15.66|>": 51148,
+  "<|15.68|>": 51149,
+  "<|15.70|>": 51150,
+  "<|15.72|>": 51151,
+  "<|15.74|>": 51152,
+  "<|15.76|>": 51153,
+  "<|15.78|>": 51154,
+  "<|15.80|>": 51155,
+  "<|15.82|>": 51156,
+  "<|15.84|>": 51157,
+  "<|15.86|>": 51158,
+  "<|15.88|>": 51159,
+  "<|15.90|>": 51160,
+  "<|15.92|>": 51161,
+  "<|15.94|>": 51162,
+  "<|15.96|>": 51163,
+  "<|15.98|>": 51164,
+  "<|16.00|>": 51165,
+  "<|16.02|>": 51166,
+  "<|16.04|>": 51167,
+  "<|16.06|>": 51168,
+  "<|16.08|>": 51169,
+  "<|16.10|>": 51170,
+  "<|16.12|>": 51171,
+  "<|16.14|>": 51172,
+  "<|16.16|>": 51173,
+  "<|16.18|>": 51174,
+  "<|16.20|>": 51175,
+  "<|16.22|>": 51176,
+  "<|16.24|>": 51177,
+  "<|16.26|>": 51178,
+  "<|16.28|>": 51179,
+  "<|16.30|>": 51180,
+  "<|16.32|>": 51181,
+  "<|16.34|>": 51182,
+  "<|16.36|>": 51183,
+  "<|16.38|>": 51184,
+  "<|16.40|>": 51185,
+  "<|16.42|>": 51186,
+  "<|16.44|>": 51187,
+  "<|16.46|>": 51188,
+  "<|16.48|>": 51189,
+  "<|16.50|>": 51190,
+  "<|16.52|>": 51191,
+  "<|16.54|>": 51192,
+  "<|16.56|>": 51193,
+  "<|16.58|>": 51194,
+  "<|16.60|>": 51195,
+  "<|16.62|>": 51196,
+  "<|16.64|>": 51197,
+  "<|16.66|>": 51198,
+  "<|16.68|>": 51199,
+  "<|16.70|>": 51200,
+  "<|16.72|>": 51201,
+  "<|16.74|>": 51202,
+  "<|16.76|>": 51203,
+  "<|16.78|>": 51204,
+  "<|16.80|>": 51205,
+  "<|16.82|>": 51206,
+  "<|16.84|>": 51207,
+  "<|16.86|>": 51208,
+  "<|16.88|>": 51209,
+  "<|16.90|>": 51210,
+  "<|16.92|>": 51211,
+  "<|16.94|>": 51212,
+  "<|16.96|>": 51213,
+  "<|16.98|>": 51214,
+  "<|17.00|>": 51215,
+  "<|17.02|>": 51216,
+  "<|17.04|>": 51217,
+  "<|17.06|>": 51218,
+  "<|17.08|>": 51219,
+  "<|17.10|>": 51220,
+  "<|17.12|>": 51221,
+  "<|17.14|>": 51222,
+  "<|17.16|>": 51223,
+  "<|17.18|>": 51224,
+  "<|17.20|>": 51225,
+  "<|17.22|>": 51226,
+  "<|17.24|>": 51227,
+  "<|17.26|>": 51228,
+  "<|17.28|>": 51229,
+  "<|17.30|>": 51230,
+  "<|17.32|>": 51231,
+  "<|17.34|>": 51232,
+  "<|17.36|>": 51233,
+  "<|17.38|>": 51234,
+  "<|17.40|>": 51235,
+  "<|17.42|>": 51236,
+  "<|17.44|>": 51237,
+  "<|17.46|>": 51238,
+  "<|17.48|>": 51239,
+  "<|17.50|>": 51240,
+  "<|17.52|>": 51241,
+  "<|17.54|>": 51242,
+  "<|17.56|>": 51243,
+  "<|17.58|>": 51244,
+  "<|17.60|>": 51245,
+  "<|17.62|>": 51246,
+  "<|17.64|>": 51247,
+  "<|17.66|>": 51248,
+  "<|17.68|>": 51249,
+  "<|17.70|>": 51250,
+  "<|17.72|>": 51251,
+  "<|17.74|>": 51252,
+  "<|17.76|>": 51253,
+  "<|17.78|>": 51254,
+  "<|17.80|>": 51255,
+  "<|17.82|>": 51256,
+  "<|17.84|>": 51257,
+  "<|17.86|>": 51258,
+  "<|17.88|>": 51259,
+  "<|17.90|>": 51260,
+  "<|17.92|>": 51261,
+  "<|17.94|>": 51262,
+  "<|17.96|>": 51263,
+  "<|17.98|>": 51264,
+  "<|18.00|>": 51265,
+  "<|18.02|>": 51266,
+  "<|18.04|>": 51267,
+  "<|18.06|>": 51268,
+  "<|18.08|>": 51269,
+  "<|18.10|>": 51270,
+  "<|18.12|>": 51271,
+  "<|18.14|>": 51272,
+  "<|18.16|>": 51273,
+  "<|18.18|>": 51274,
+  "<|18.20|>": 51275,
+  "<|18.22|>": 51276,
+  "<|18.24|>": 51277,
+  "<|18.26|>": 51278,
+  "<|18.28|>": 51279,
+  "<|18.30|>": 51280,
+  "<|18.32|>": 51281,
+  "<|18.34|>": 51282,
+  "<|18.36|>": 51283,
+  "<|18.38|>": 51284,
+  "<|18.40|>": 51285,
+  "<|18.42|>": 51286,
+  "<|18.44|>": 51287,
+  "<|18.46|>": 51288,
+  "<|18.48|>": 51289,
+  "<|18.50|>": 51290,
+  "<|18.52|>": 51291,
+  "<|18.54|>": 51292,
+  "<|18.56|>": 51293,
+  "<|18.58|>": 51294,
+  "<|18.60|>": 51295,
+  "<|18.62|>": 51296,
+  "<|18.64|>": 51297,
+  "<|18.66|>": 51298,
+  "<|18.68|>": 51299,
+  "<|18.70|>": 51300,
+  "<|18.72|>": 51301,
+  "<|18.74|>": 51302,
+  "<|18.76|>": 51303,
+  "<|18.78|>": 51304,
+  "<|18.80|>": 51305,
+  "<|18.82|>": 51306,
+  "<|18.84|>": 51307,
+  "<|18.86|>": 51308,
+  "<|18.88|>": 51309,
+  "<|18.90|>": 51310,
+  "<|18.92|>": 51311,
+  "<|18.94|>": 51312,
+  "<|18.96|>": 51313,
+  "<|18.98|>": 51314,
+  "<|19.00|>": 51315,
+  "<|19.02|>": 51316,
+  "<|19.04|>": 51317,
+  "<|19.06|>": 51318,
+  "<|19.08|>": 51319,
+  "<|19.10|>": 51320,
+  "<|19.12|>": 51321,
+  "<|19.14|>": 51322,
+  "<|19.16|>": 51323,
+  "<|19.18|>": 51324,
+  "<|19.20|>": 51325,
+  "<|19.22|>": 51326,
+  "<|19.24|>": 51327,
+  "<|19.26|>": 51328,
+  "<|19.28|>": 51329,
+  "<|19.30|>": 51330,
+  "<|19.32|>": 51331,
+  "<|19.34|>": 51332,
+  "<|19.36|>": 51333,
+  "<|19.38|>": 51334,
+  "<|19.40|>": 51335,
+  "<|19.42|>": 51336,
+  "<|19.44|>": 51337,
+  "<|19.46|>": 51338,
+  "<|19.48|>": 51339,
+  "<|19.50|>": 51340,
+  "<|19.52|>": 51341,
+  "<|19.54|>": 51342,
+  "<|19.56|>": 51343,
+  "<|19.58|>": 51344,
+  "<|19.60|>": 51345,
+  "<|19.62|>": 51346,
+  "<|19.64|>": 51347,
+  "<|19.66|>": 51348,
+  "<|19.68|>": 51349,
+  "<|19.70|>": 51350,
+  "<|19.72|>": 51351,
+  "<|19.74|>": 51352,
+  "<|19.76|>": 51353,
+  "<|19.78|>": 51354,
+  "<|19.80|>": 51355,
+  "<|19.82|>": 51356,
+  "<|19.84|>": 51357,
+  "<|19.86|>": 51358,
+  "<|19.88|>": 51359,
+  "<|19.90|>": 51360,
+  "<|19.92|>": 51361,
+  "<|19.94|>": 51362,
+  "<|19.96|>": 51363,
+  "<|19.98|>": 51364,
+  "<|2.00|>": 50465,
+  "<|2.02|>": 50466,
+  "<|2.04|>": 50467,
+  "<|2.06|>": 50468,
+  "<|2.08|>": 50469,
+  "<|2.10|>": 50470,
+  "<|2.12|>": 50471,
+  "<|2.14|>": 50472,
+  "<|2.16|>": 50473,
+  "<|2.18|>": 50474,
+  "<|2.20|>": 50475,
+  "<|2.22|>": 50476,
+  "<|2.24|>": 50477,
+  "<|2.26|>": 50478,
+  "<|2.28|>": 50479,
+  "<|2.30|>": 50480,
+  "<|2.32|>": 50481,
+  "<|2.34|>": 50482,
+  "<|2.36|>": 50483,
+  "<|2.38|>": 50484,
+  "<|2.40|>": 50485,
+  "<|2.42|>": 50486,
+  "<|2.44|>": 50487,
+  "<|2.46|>": 50488,
+  "<|2.48|>": 50489,
+  "<|2.50|>": 50490,
+  "<|2.52|>": 50491,
+  "<|2.54|>": 50492,
+  "<|2.56|>": 50493,
+  "<|2.58|>": 50494,
+  "<|2.60|>": 50495,
+  "<|2.62|>": 50496,
+  "<|2.64|>": 50497,
+  "<|2.66|>": 50498,
+  "<|2.68|>": 50499,
+  "<|2.70|>": 50500,
+  "<|2.72|>": 50501,
+  "<|2.74|>": 50502,
+  "<|2.76|>": 50503,
+  "<|2.78|>": 50504,
+  "<|2.80|>": 50505,
+  "<|2.82|>": 50506,
+  "<|2.84|>": 50507,
+  "<|2.86|>": 50508,
+  "<|2.88|>": 50509,
+  "<|2.90|>": 50510,
+  "<|2.92|>": 50511,
+  "<|2.94|>": 50512,
+  "<|2.96|>": 50513,
+  "<|2.98|>": 50514,
+  "<|20.00|>": 51365,
+  "<|20.02|>": 51366,
+  "<|20.04|>": 51367,
+  "<|20.06|>": 51368,
+  "<|20.08|>": 51369,
+  "<|20.10|>": 51370,
+  "<|20.12|>": 51371,
+  "<|20.14|>": 51372,
+  "<|20.16|>": 51373,
+  "<|20.18|>": 51374,
+  "<|20.20|>": 51375,
+  "<|20.22|>": 51376,
+  "<|20.24|>": 51377,
+  "<|20.26|>": 51378,
+  "<|20.28|>": 51379,
+  "<|20.30|>": 51380,
+  "<|20.32|>": 51381,
+  "<|20.34|>": 51382,
+  "<|20.36|>": 51383,
+  "<|20.38|>": 51384,
+  "<|20.40|>": 51385,
+  "<|20.42|>": 51386,
+  "<|20.44|>": 51387,
+  "<|20.46|>": 51388,
+  "<|20.48|>": 51389,
+  "<|20.50|>": 51390,
+  "<|20.52|>": 51391,
+  "<|20.54|>": 51392,
+  "<|20.56|>": 51393,
+  "<|20.58|>": 51394,
+  "<|20.60|>": 51395,
+  "<|20.62|>": 51396,
+  "<|20.64|>": 51397,
+  "<|20.66|>": 51398,
+  "<|20.68|>": 51399,
+  "<|20.70|>": 51400,
+  "<|20.72|>": 51401,
+  "<|20.74|>": 51402,
+  "<|20.76|>": 51403,
+  "<|20.78|>": 51404,
+  "<|20.80|>": 51405,
+  "<|20.82|>": 51406,
+  "<|20.84|>": 51407,
+  "<|20.86|>": 51408,
+  "<|20.88|>": 51409,
+  "<|20.90|>": 51410,
+  "<|20.92|>": 51411,
+  "<|20.94|>": 51412,
+  "<|20.96|>": 51413,
+  "<|20.98|>": 51414,
+  "<|21.00|>": 51415,
+  "<|21.02|>": 51416,
+  "<|21.04|>": 51417,
+  "<|21.06|>": 51418,
+  "<|21.08|>": 51419,
+  "<|21.10|>": 51420,
+  "<|21.12|>": 51421,
+  "<|21.14|>": 51422,
+  "<|21.16|>": 51423,
+  "<|21.18|>": 51424,
+  "<|21.20|>": 51425,
+  "<|21.22|>": 51426,
+  "<|21.24|>": 51427,
+  "<|21.26|>": 51428,
+  "<|21.28|>": 51429,
+  "<|21.30|>": 51430,
+  "<|21.32|>": 51431,
+  "<|21.34|>": 51432,
+  "<|21.36|>": 51433,
+  "<|21.38|>": 51434,
+  "<|21.40|>": 51435,
+  "<|21.42|>": 51436,
+  "<|21.44|>": 51437,
+  "<|21.46|>": 51438,
+  "<|21.48|>": 51439,
+  "<|21.50|>": 51440,
+  "<|21.52|>": 51441,
+  "<|21.54|>": 51442,
+  "<|21.56|>": 51443,
+  "<|21.58|>": 51444,
+  "<|21.60|>": 51445,
+  "<|21.62|>": 51446,
+  "<|21.64|>": 51447,
+  "<|21.66|>": 51448,
+  "<|21.68|>": 51449,
+  "<|21.70|>": 51450,
+  "<|21.72|>": 51451,
+  "<|21.74|>": 51452,
+  "<|21.76|>": 51453,
+  "<|21.78|>": 51454,
+  "<|21.80|>": 51455,
+  "<|21.82|>": 51456,
+  "<|21.84|>": 51457,
+  "<|21.86|>": 51458,
+  "<|21.88|>": 51459,
+  "<|21.90|>": 51460,
+  "<|21.92|>": 51461,
+  "<|21.94|>": 51462,
+  "<|21.96|>": 51463,
+  "<|21.98|>": 51464,
+  "<|22.00|>": 51465,
+  "<|22.02|>": 51466,
+  "<|22.04|>": 51467,
+  "<|22.06|>": 51468,
+  "<|22.08|>": 51469,
+  "<|22.10|>": 51470,
+  "<|22.12|>": 51471,
+  "<|22.14|>": 51472,
+  "<|22.16|>": 51473,
+  "<|22.18|>": 51474,
+  "<|22.20|>": 51475,
+  "<|22.22|>": 51476,
+  "<|22.24|>": 51477,
+  "<|22.26|>": 51478,
+  "<|22.28|>": 51479,
+  "<|22.30|>": 51480,
+  "<|22.32|>": 51481,
+  "<|22.34|>": 51482,
+  "<|22.36|>": 51483,
+  "<|22.38|>": 51484,
+  "<|22.40|>": 51485,
+  "<|22.42|>": 51486,
+  "<|22.44|>": 51487,
+  "<|22.46|>": 51488,
+  "<|22.48|>": 51489,
+  "<|22.50|>": 51490,
+  "<|22.52|>": 51491,
+  "<|22.54|>": 51492,
+  "<|22.56|>": 51493,
+  "<|22.58|>": 51494,
+  "<|22.60|>": 51495,
+  "<|22.62|>": 51496,
+  "<|22.64|>": 51497,
+  "<|22.66|>": 51498,
+  "<|22.68|>": 51499,
+  "<|22.70|>": 51500,
+  "<|22.72|>": 51501,
+  "<|22.74|>": 51502,
+  "<|22.76|>": 51503,
+  "<|22.78|>": 51504,
+  "<|22.80|>": 51505,
+  "<|22.82|>": 51506,
+  "<|22.84|>": 51507,
+  "<|22.86|>": 51508,
+  "<|22.88|>": 51509,
+  "<|22.90|>": 51510,
+  "<|22.92|>": 51511,
+  "<|22.94|>": 51512,
+  "<|22.96|>": 51513,
+  "<|22.98|>": 51514,
+  "<|23.00|>": 51515,
+  "<|23.02|>": 51516,
+  "<|23.04|>": 51517,
+  "<|23.06|>": 51518,
+  "<|23.08|>": 51519,
+  "<|23.10|>": 51520,
+  "<|23.12|>": 51521,
+  "<|23.14|>": 51522,
+  "<|23.16|>": 51523,
+  "<|23.18|>": 51524,
+  "<|23.20|>": 51525,
+  "<|23.22|>": 51526,
+  "<|23.24|>": 51527,
+  "<|23.26|>": 51528,
+  "<|23.28|>": 51529,
+  "<|23.30|>": 51530,
+  "<|23.32|>": 51531,
+  "<|23.34|>": 51532,
+  "<|23.36|>": 51533,
+  "<|23.38|>": 51534,
+  "<|23.40|>": 51535,
+  "<|23.42|>": 51536,
+  "<|23.44|>": 51537,
+  "<|23.46|>": 51538,
+  "<|23.48|>": 51539,
+  "<|23.50|>": 51540,
+  "<|23.52|>": 51541,
+  "<|23.54|>": 51542,
+  "<|23.56|>": 51543,
+  "<|23.58|>": 51544,
+  "<|23.60|>": 51545,
+  "<|23.62|>": 51546,
+  "<|23.64|>": 51547,
+  "<|23.66|>": 51548,
+  "<|23.68|>": 51549,
+  "<|23.70|>": 51550,
+  "<|23.72|>": 51551,
+  "<|23.74|>": 51552,
+  "<|23.76|>": 51553,
+  "<|23.78|>": 51554,
+  "<|23.80|>": 51555,
+  "<|23.82|>": 51556,
+  "<|23.84|>": 51557,
+  "<|23.86|>": 51558,
+  "<|23.88|>": 51559,
+  "<|23.90|>": 51560,
+  "<|23.92|>": 51561,
+  "<|23.94|>": 51562,
+  "<|23.96|>": 51563,
+  "<|23.98|>": 51564,
+  "<|24.00|>": 51565,
+  "<|24.02|>": 51566,
+  "<|24.04|>": 51567,
+  "<|24.06|>": 51568,
+  "<|24.08|>": 51569,
+  "<|24.10|>": 51570,
+  "<|24.12|>": 51571,
+  "<|24.14|>": 51572,
+  "<|24.16|>": 51573,
+  "<|24.18|>": 51574,
+  "<|24.20|>": 51575,
+  "<|24.22|>": 51576,
+  "<|24.24|>": 51577,
+  "<|24.26|>": 51578,
+  "<|24.28|>": 51579,
+  "<|24.30|>": 51580,
+  "<|24.32|>": 51581,
+  "<|24.34|>": 51582,
+  "<|24.36|>": 51583,
+  "<|24.38|>": 51584,
+  "<|24.40|>": 51585,
+  "<|24.42|>": 51586,
+  "<|24.44|>": 51587,
+  "<|24.46|>": 51588,
+  "<|24.48|>": 51589,
+  "<|24.50|>": 51590,
+  "<|24.52|>": 51591,
+  "<|24.54|>": 51592,
+  "<|24.56|>": 51593,
+  "<|24.58|>": 51594,
+  "<|24.60|>": 51595,
+  "<|24.62|>": 51596,
+  "<|24.64|>": 51597,
+  "<|24.66|>": 51598,
+  "<|24.68|>": 51599,
+  "<|24.70|>": 51600,
+  "<|24.72|>": 51601,
+  "<|24.74|>": 51602,
+  "<|24.76|>": 51603,
+  "<|24.78|>": 51604,
+  "<|24.80|>": 51605,
+  "<|24.82|>": 51606,
+  "<|24.84|>": 51607,
+  "<|24.86|>": 51608,
+  "<|24.88|>": 51609,
+  "<|24.90|>": 51610,
+  "<|24.92|>": 51611,
+  "<|24.94|>": 51612,
+  "<|24.96|>": 51613,
+  "<|24.98|>": 51614,
+  "<|25.00|>": 51615,
+  "<|25.02|>": 51616,
+  "<|25.04|>": 51617,
+  "<|25.06|>": 51618,
+  "<|25.08|>": 51619,
+  "<|25.10|>": 51620,
+  "<|25.12|>": 51621,
+  "<|25.14|>": 51622,
+  "<|25.16|>": 51623,
+  "<|25.18|>": 51624,
+  "<|25.20|>": 51625,
+  "<|25.22|>": 51626,
+  "<|25.24|>": 51627,
+  "<|25.26|>": 51628,
+  "<|25.28|>": 51629,
+  "<|25.30|>": 51630,
+  "<|25.32|>": 51631,
+  "<|25.34|>": 51632,
+  "<|25.36|>": 51633,
+  "<|25.38|>": 51634,
+  "<|25.40|>": 51635,
+  "<|25.42|>": 51636,
+  "<|25.44|>": 51637,
+  "<|25.46|>": 51638,
+  "<|25.48|>": 51639,
+  "<|25.50|>": 51640,
+  "<|25.52|>": 51641,
+  "<|25.54|>": 51642,
+  "<|25.56|>": 51643,
+  "<|25.58|>": 51644,
+  "<|25.60|>": 51645,
+  "<|25.62|>": 51646,
+  "<|25.64|>": 51647,
+  "<|25.66|>": 51648,
+  "<|25.68|>": 51649,
+  "<|25.70|>": 51650,
+  "<|25.72|>": 51651,
+  "<|25.74|>": 51652,
+  "<|25.76|>": 51653,
+  "<|25.78|>": 51654,
+  "<|25.80|>": 51655,
+  "<|25.82|>": 51656,
+  "<|25.84|>": 51657,
+  "<|25.86|>": 51658,
+  "<|25.88|>": 51659,
+  "<|25.90|>": 51660,
+  "<|25.92|>": 51661,
+  "<|25.94|>": 51662,
+  "<|25.96|>": 51663,
+  "<|25.98|>": 51664,
+  "<|26.00|>": 51665,
+  "<|26.02|>": 51666,
+  "<|26.04|>": 51667,
+  "<|26.06|>": 51668,
+  "<|26.08|>": 51669,
+  "<|26.10|>": 51670,
+  "<|26.12|>": 51671,
+  "<|26.14|>": 51672,
+  "<|26.16|>": 51673,
+  "<|26.18|>": 51674,
+  "<|26.20|>": 51675,
+  "<|26.22|>": 51676,
+  "<|26.24|>": 51677,
+  "<|26.26|>": 51678,
+  "<|26.28|>": 51679,
+  "<|26.30|>": 51680,
+  "<|26.32|>": 51681,
+  "<|26.34|>": 51682,
+  "<|26.36|>": 51683,
+  "<|26.38|>": 51684,
+  "<|26.40|>": 51685,
+  "<|26.42|>": 51686,
+  "<|26.44|>": 51687,
+  "<|26.46|>": 51688,
+  "<|26.48|>": 51689,
+  "<|26.50|>": 51690,
+  "<|26.52|>": 51691,
+  "<|26.54|>": 51692,
+  "<|26.56|>": 51693,
+  "<|26.58|>": 51694,
+  "<|26.60|>": 51695,
+  "<|26.62|>": 51696,
+  "<|26.64|>": 51697,
+  "<|26.66|>": 51698,
+  "<|26.68|>": 51699,
+  "<|26.70|>": 51700,
+  "<|26.72|>": 51701,
+  "<|26.74|>": 51702,
+  "<|26.76|>": 51703,
+  "<|26.78|>": 51704,
+  "<|26.80|>": 51705,
+  "<|26.82|>": 51706,
+  "<|26.84|>": 51707,
+  "<|26.86|>": 51708,
+  "<|26.88|>": 51709,
+  "<|26.90|>": 51710,
+  "<|26.92|>": 51711,
+  "<|26.94|>": 51712,
+  "<|26.96|>": 51713,
+  "<|26.98|>": 51714,
+  "<|27.00|>": 51715,
+  "<|27.02|>": 51716,
+  "<|27.04|>": 51717,
+  "<|27.06|>": 51718,
+  "<|27.08|>": 51719,
+  "<|27.10|>": 51720,
+  "<|27.12|>": 51721,
+  "<|27.14|>": 51722,
+  "<|27.16|>": 51723,
+  "<|27.18|>": 51724,
+  "<|27.20|>": 51725,
+  "<|27.22|>": 51726,
+  "<|27.24|>": 51727,
+  "<|27.26|>": 51728,
+  "<|27.28|>": 51729,
+  "<|27.30|>": 51730,
+  "<|27.32|>": 51731,
+  "<|27.34|>": 51732,
+  "<|27.36|>": 51733,
+  "<|27.38|>": 51734,
+  "<|27.40|>": 51735,
+  "<|27.42|>": 51736,
+  "<|27.44|>": 51737,
+  "<|27.46|>": 51738,
+  "<|27.48|>": 51739,
+  "<|27.50|>": 51740,
+  "<|27.52|>": 51741,
+  "<|27.54|>": 51742,
+  "<|27.56|>": 51743,
+  "<|27.58|>": 51744,
+  "<|27.60|>": 51745,
+  "<|27.62|>": 51746,
+  "<|27.64|>": 51747,
+  "<|27.66|>": 51748,
+  "<|27.68|>": 51749,
+  "<|27.70|>": 51750,
+  "<|27.72|>": 51751,
+  "<|27.74|>": 51752,
+  "<|27.76|>": 51753,
+  "<|27.78|>": 51754,
+  "<|27.80|>": 51755,
+  "<|27.82|>": 51756,
+  "<|27.84|>": 51757,
+  "<|27.86|>": 51758,
+  "<|27.88|>": 51759,
+  "<|27.90|>": 51760,
+  "<|27.92|>": 51761,
+  "<|27.94|>": 51762,
+  "<|27.96|>": 51763,
+  "<|27.98|>": 51764,
+  "<|28.00|>": 51765,
+  "<|28.02|>": 51766,
+  "<|28.04|>": 51767,
+  "<|28.06|>": 51768,
+  "<|28.08|>": 51769,
+  "<|28.10|>": 51770,
+  "<|28.12|>": 51771,
+  "<|28.14|>": 51772,
+  "<|28.16|>": 51773,
+  "<|28.18|>": 51774,
+  "<|28.20|>": 51775,
+  "<|28.22|>": 51776,
+  "<|28.24|>": 51777,
+  "<|28.26|>": 51778,
+  "<|28.28|>": 51779,
+  "<|28.30|>": 51780,
+  "<|28.32|>": 51781,
+  "<|28.34|>": 51782,
+  "<|28.36|>": 51783,
+  "<|28.38|>": 51784,
+  "<|28.40|>": 51785,
+  "<|28.42|>": 51786,
+  "<|28.44|>": 51787,
+  "<|28.46|>": 51788,
+  "<|28.48|>": 51789,
+  "<|28.50|>": 51790,
+  "<|28.52|>": 51791,
+  "<|28.54|>": 51792,
+  "<|28.56|>": 51793,
+  "<|28.58|>": 51794,
+  "<|28.60|>": 51795,
+  "<|28.62|>": 51796,
+  "<|28.64|>": 51797,
+  "<|28.66|>": 51798,
+  "<|28.68|>": 51799,
+  "<|28.70|>": 51800,
+  "<|28.72|>": 51801,
+  "<|28.74|>": 51802,
+  "<|28.76|>": 51803,
+  "<|28.78|>": 51804,
+  "<|28.80|>": 51805,
+  "<|28.82|>": 51806,
+  "<|28.84|>": 51807,
+  "<|28.86|>": 51808,
+  "<|28.88|>": 51809,
+  "<|28.90|>": 51810,
+  "<|28.92|>": 51811,
+  "<|28.94|>": 51812,
+  "<|28.96|>": 51813,
+  "<|28.98|>": 51814,
+  "<|29.00|>": 51815,
+  "<|29.02|>": 51816,
+  "<|29.04|>": 51817,
+  "<|29.06|>": 51818,
+  "<|29.08|>": 51819,
+  "<|29.10|>": 51820,
+  "<|29.12|>": 51821,
+  "<|29.14|>": 51822,
+  "<|29.16|>": 51823,
+  "<|29.18|>": 51824,
+  "<|29.20|>": 51825,
+  "<|29.22|>": 51826,
+  "<|29.24|>": 51827,
+  "<|29.26|>": 51828,
+  "<|29.28|>": 51829,
+  "<|29.30|>": 51830,
+  "<|29.32|>": 51831,
+  "<|29.34|>": 51832,
+  "<|29.36|>": 51833,
+  "<|29.38|>": 51834,
+  "<|29.40|>": 51835,
+  "<|29.42|>": 51836,
+  "<|29.44|>": 51837,
+  "<|29.46|>": 51838,
+  "<|29.48|>": 51839,
+  "<|29.50|>": 51840,
+  "<|29.52|>": 51841,
+  "<|29.54|>": 51842,
+  "<|29.56|>": 51843,
+  "<|29.58|>": 51844,
+  "<|29.60|>": 51845,
+  "<|29.62|>": 51846,
+  "<|29.64|>": 51847,
+  "<|29.66|>": 51848,
+  "<|29.68|>": 51849,
+  "<|29.70|>": 51850,
+  "<|29.72|>": 51851,
+  "<|29.74|>": 51852,
+  "<|29.76|>": 51853,
+  "<|29.78|>": 51854,
+  "<|29.80|>": 51855,
+  "<|29.82|>": 51856,
+  "<|29.84|>": 51857,
+  "<|29.86|>": 51858,
+  "<|29.88|>": 51859,
+  "<|29.90|>": 51860,
+  "<|29.92|>": 51861,
+  "<|29.94|>": 51862,
+  "<|29.96|>": 51863,
+  "<|29.98|>": 51864,
+  "<|3.00|>": 50515,
+  "<|3.02|>": 50516,
+  "<|3.04|>": 50517,
+  "<|3.06|>": 50518,
+  "<|3.08|>": 50519,
+  "<|3.10|>": 50520,
+  "<|3.12|>": 50521,
+  "<|3.14|>": 50522,
+  "<|3.16|>": 50523,
+  "<|3.18|>": 50524,
+  "<|3.20|>": 50525,
+  "<|3.22|>": 50526,
+  "<|3.24|>": 50527,
+  "<|3.26|>": 50528,
+  "<|3.28|>": 50529,
+  "<|3.30|>": 50530,
+  "<|3.32|>": 50531,
+  "<|3.34|>": 50532,
+  "<|3.36|>": 50533,
+  "<|3.38|>": 50534,
+  "<|3.40|>": 50535,
+  "<|3.42|>": 50536,
+  "<|3.44|>": 50537,
+  "<|3.46|>": 50538,
+  "<|3.48|>": 50539,
+  "<|3.50|>": 50540,
+  "<|3.52|>": 50541,
+  "<|3.54|>": 50542,
+  "<|3.56|>": 50543,
+  "<|3.58|>": 50544,
+  "<|3.60|>": 50545,
+  "<|3.62|>": 50546,
+  "<|3.64|>": 50547,
+  "<|3.66|>": 50548,
+  "<|3.68|>": 50549,
+  "<|3.70|>": 50550,
+  "<|3.72|>": 50551,
+  "<|3.74|>": 50552,
+  "<|3.76|>": 50553,
+  "<|3.78|>": 50554,
+  "<|3.80|>": 50555,
+  "<|3.82|>": 50556,
+  "<|3.84|>": 50557,
+  "<|3.86|>": 50558,
+  "<|3.88|>": 50559,
+  "<|3.90|>": 50560,
+  "<|3.92|>": 50561,
+  "<|3.94|>": 50562,
+  "<|3.96|>": 50563,
+  "<|3.98|>": 50564,
+  "<|30.00|>": 51865,
+  "<|4.00|>": 50565,
+  "<|4.02|>": 50566,
+  "<|4.04|>": 50567,
+  "<|4.06|>": 50568,
+  "<|4.08|>": 50569,
+  "<|4.10|>": 50570,
+  "<|4.12|>": 50571,
+  "<|4.14|>": 50572,
+  "<|4.16|>": 50573,
+  "<|4.18|>": 50574,
+  "<|4.20|>": 50575,
+  "<|4.22|>": 50576,
+  "<|4.24|>": 50577,
+  "<|4.26|>": 50578,
+  "<|4.28|>": 50579,
+  "<|4.30|>": 50580,
+  "<|4.32|>": 50581,
+  "<|4.34|>": 50582,
+  "<|4.36|>": 50583,
+  "<|4.38|>": 50584,
+  "<|4.40|>": 50585,
+  "<|4.42|>": 50586,
+  "<|4.44|>": 50587,
+  "<|4.46|>": 50588,
+  "<|4.48|>": 50589,
+  "<|4.50|>": 50590,
+  "<|4.52|>": 50591,
+  "<|4.54|>": 50592,
+  "<|4.56|>": 50593,
+  "<|4.58|>": 50594,
+  "<|4.60|>": 50595,
+  "<|4.62|>": 50596,
+  "<|4.64|>": 50597,
+  "<|4.66|>": 50598,
+  "<|4.68|>": 50599,
+  "<|4.70|>": 50600,
+  "<|4.72|>": 50601,
+  "<|4.74|>": 50602,
+  "<|4.76|>": 50603,
+  "<|4.78|>": 50604,
+  "<|4.80|>": 50605,
+  "<|4.82|>": 50606,
+  "<|4.84|>": 50607,
+  "<|4.86|>": 50608,
+  "<|4.88|>": 50609,
+  "<|4.90|>": 50610,
+  "<|4.92|>": 50611,
+  "<|4.94|>": 50612,
+  "<|4.96|>": 50613,
+  "<|4.98|>": 50614,
+  "<|5.00|>": 50615,
+  "<|5.02|>": 50616,
+  "<|5.04|>": 50617,
+  "<|5.06|>": 50618,
+  "<|5.08|>": 50619,
+  "<|5.10|>": 50620,
+  "<|5.12|>": 50621,
+  "<|5.14|>": 50622,
+  "<|5.16|>": 50623,
+  "<|5.18|>": 50624,
+  "<|5.20|>": 50625,
+  "<|5.22|>": 50626,
+  "<|5.24|>": 50627,
+  "<|5.26|>": 50628,
+  "<|5.28|>": 50629,
+  "<|5.30|>": 50630,
+  "<|5.32|>": 50631,
+  "<|5.34|>": 50632,
+  "<|5.36|>": 50633,
+  "<|5.38|>": 50634,
+  "<|5.40|>": 50635,
+  "<|5.42|>": 50636,
+  "<|5.44|>": 50637,
+  "<|5.46|>": 50638,
+  "<|5.48|>": 50639,
+  "<|5.50|>": 50640,
+  "<|5.52|>": 50641,
+  "<|5.54|>": 50642,
+  "<|5.56|>": 50643,
+  "<|5.58|>": 50644,
+  "<|5.60|>": 50645,
+  "<|5.62|>": 50646,
+  "<|5.64|>": 50647,
+  "<|5.66|>": 50648,
+  "<|5.68|>": 50649,
+  "<|5.70|>": 50650,
+  "<|5.72|>": 50651,
+  "<|5.74|>": 50652,
+  "<|5.76|>": 50653,
+  "<|5.78|>": 50654,
+  "<|5.80|>": 50655,
+  "<|5.82|>": 50656,
+  "<|5.84|>": 50657,
+  "<|5.86|>": 50658,
+  "<|5.88|>": 50659,
+  "<|5.90|>": 50660,
+  "<|5.92|>": 50661,
+  "<|5.94|>": 50662,
+  "<|5.96|>": 50663,
+  "<|5.98|>": 50664,
+  "<|6.00|>": 50665,
+  "<|6.02|>": 50666,
+  "<|6.04|>": 50667,
+  "<|6.06|>": 50668,
+  "<|6.08|>": 50669,
+  "<|6.10|>": 50670,
+  "<|6.12|>": 50671,
+  "<|6.14|>": 50672,
+  "<|6.16|>": 50673,
+  "<|6.18|>": 50674,
+  "<|6.20|>": 50675,
+  "<|6.22|>": 50676,
+  "<|6.24|>": 50677,
+  "<|6.26|>": 50678,
+  "<|6.28|>": 50679,
+  "<|6.30|>": 50680,
+  "<|6.32|>": 50681,
+  "<|6.34|>": 50682,
+  "<|6.36|>": 50683,
+  "<|6.38|>": 50684,
+  "<|6.40|>": 50685,
+  "<|6.42|>": 50686,
+  "<|6.44|>": 50687,
+  "<|6.46|>": 50688,
+  "<|6.48|>": 50689,
+  "<|6.50|>": 50690,
+  "<|6.52|>": 50691,
+  "<|6.54|>": 50692,
+  "<|6.56|>": 50693,
+  "<|6.58|>": 50694,
+  "<|6.60|>": 50695,
+  "<|6.62|>": 50696,
+  "<|6.64|>": 50697,
+  "<|6.66|>": 50698,
+  "<|6.68|>": 50699,
+  "<|6.70|>": 50700,
+  "<|6.72|>": 50701,
+  "<|6.74|>": 50702,
+  "<|6.76|>": 50703,
+  "<|6.78|>": 50704,
+  "<|6.80|>": 50705,
+  "<|6.82|>": 50706,
+  "<|6.84|>": 50707,
+  "<|6.86|>": 50708,
+  "<|6.88|>": 50709,
+  "<|6.90|>": 50710,
+  "<|6.92|>": 50711,
+  "<|6.94|>": 50712,
+  "<|6.96|>": 50713,
+  "<|6.98|>": 50714,
+  "<|7.00|>": 50715,
+  "<|7.02|>": 50716,
+  "<|7.04|>": 50717,
+  "<|7.06|>": 50718,
+  "<|7.08|>": 50719,
+  "<|7.10|>": 50720,
+  "<|7.12|>": 50721,
+  "<|7.14|>": 50722,
+  "<|7.16|>": 50723,
+  "<|7.18|>": 50724,
+  "<|7.20|>": 50725,
+  "<|7.22|>": 50726,
+  "<|7.24|>": 50727,
+  "<|7.26|>": 50728,
+  "<|7.28|>": 50729,
+  "<|7.30|>": 50730,
+  "<|7.32|>": 50731,
+  "<|7.34|>": 50732,
+  "<|7.36|>": 50733,
+  "<|7.38|>": 50734,
+  "<|7.40|>": 50735,
+  "<|7.42|>": 50736,
+  "<|7.44|>": 50737,
+  "<|7.46|>": 50738,
+  "<|7.48|>": 50739,
+  "<|7.50|>": 50740,
+  "<|7.52|>": 50741,
+  "<|7.54|>": 50742,
+  "<|7.56|>": 50743,
+  "<|7.58|>": 50744,
+  "<|7.60|>": 50745,
+  "<|7.62|>": 50746,
+  "<|7.64|>": 50747,
+  "<|7.66|>": 50748,
+  "<|7.68|>": 50749,
+  "<|7.70|>": 50750,
+  "<|7.72|>": 50751,
+  "<|7.74|>": 50752,
+  "<|7.76|>": 50753,
+  "<|7.78|>": 50754,
+  "<|7.80|>": 50755,
+  "<|7.82|>": 50756,
+  "<|7.84|>": 50757,
+  "<|7.86|>": 50758,
+  "<|7.88|>": 50759,
+  "<|7.90|>": 50760,
+  "<|7.92|>": 50761,
+  "<|7.94|>": 50762,
+  "<|7.96|>": 50763,
+  "<|7.98|>": 50764,
+  "<|8.00|>": 50765,
+  "<|8.02|>": 50766,
+  "<|8.04|>": 50767,
+  "<|8.06|>": 50768,
+  "<|8.08|>": 50769,
+  "<|8.10|>": 50770,
+  "<|8.12|>": 50771,
+  "<|8.14|>": 50772,
+  "<|8.16|>": 50773,
+  "<|8.18|>": 50774,
+  "<|8.20|>": 50775,
+  "<|8.22|>": 50776,
+  "<|8.24|>": 50777,
+  "<|8.26|>": 50778,
+  "<|8.28|>": 50779,
+  "<|8.30|>": 50780,
+  "<|8.32|>": 50781,
+  "<|8.34|>": 50782,
+  "<|8.36|>": 50783,
+  "<|8.38|>": 50784,
+  "<|8.40|>": 50785,
+  "<|8.42|>": 50786,
+  "<|8.44|>": 50787,
+  "<|8.46|>": 50788,
+  "<|8.48|>": 50789,
+  "<|8.50|>": 50790,
+  "<|8.52|>": 50791,
+  "<|8.54|>": 50792,
+  "<|8.56|>": 50793,
+  "<|8.58|>": 50794,
+  "<|8.60|>": 50795,
+  "<|8.62|>": 50796,
+  "<|8.64|>": 50797,
+  "<|8.66|>": 50798,
+  "<|8.68|>": 50799,
+  "<|8.70|>": 50800,
+  "<|8.72|>": 50801,
+  "<|8.74|>": 50802,
+  "<|8.76|>": 50803,
+  "<|8.78|>": 50804,
+  "<|8.80|>": 50805,
+  "<|8.82|>": 50806,
+  "<|8.84|>": 50807,
+  "<|8.86|>": 50808,
+  "<|8.88|>": 50809,
+  "<|8.90|>": 50810,
+  "<|8.92|>": 50811,
+  "<|8.94|>": 50812,
+  "<|8.96|>": 50813,
+  "<|8.98|>": 50814,
+  "<|9.00|>": 50815,
+  "<|9.02|>": 50816,
+  "<|9.04|>": 50817,
+  "<|9.06|>": 50818,
+  "<|9.08|>": 50819,
+  "<|9.10|>": 50820,
+  "<|9.12|>": 50821,
+  "<|9.14|>": 50822,
+  "<|9.16|>": 50823,
+  "<|9.18|>": 50824,
+  "<|9.20|>": 50825,
+  "<|9.22|>": 50826,
+  "<|9.24|>": 50827,
+  "<|9.26|>": 50828,
+  "<|9.28|>": 50829,
+  "<|9.30|>": 50830,
+  "<|9.32|>": 50831,
+  "<|9.34|>": 50832,
+  "<|9.36|>": 50833,
+  "<|9.38|>": 50834,
+  "<|9.40|>": 50835,
+  "<|9.42|>": 50836,
+  "<|9.44|>": 50837,
+  "<|9.46|>": 50838,
+  "<|9.48|>": 50839,
+  "<|9.50|>": 50840,
+  "<|9.52|>": 50841,
+  "<|9.54|>": 50842,
+  "<|9.56|>": 50843,
+  "<|9.58|>": 50844,
+  "<|9.60|>": 50845,
+  "<|9.62|>": 50846,
+  "<|9.64|>": 50847,
+  "<|9.66|>": 50848,
+  "<|9.68|>": 50849,
+  "<|9.70|>": 50850,
+  "<|9.72|>": 50851,
+  "<|9.74|>": 50852,
+  "<|9.76|>": 50853,
+  "<|9.78|>": 50854,
+  "<|9.80|>": 50855,
+  "<|9.82|>": 50856,
+  "<|9.84|>": 50857,
+  "<|9.86|>": 50858,
+  "<|9.88|>": 50859,
+  "<|9.90|>": 50860,
+  "<|9.92|>": 50861,
+  "<|9.94|>": 50862,
+  "<|9.96|>": 50863,
+  "<|9.98|>": 50864,
+  "<|af|>": 50327,
+  "<|am|>": 50334,
+  "<|ar|>": 50272,
+  "<|as|>": 50350,
+  "<|az|>": 50304,
+  "<|ba|>": 50355,
+  "<|be|>": 50330,
+  "<|bg|>": 50292,
+  "<|bn|>": 50302,
+  "<|bo|>": 50347,
+  "<|br|>": 50309,
+  "<|bs|>": 50315,
+  "<|ca|>": 50270,
+  "<|cs|>": 50283,
+  "<|cy|>": 50297,
+  "<|da|>": 50285,
+  "<|de|>": 50261,
+  "<|el|>": 50281,
+  "<|endoftext|>": 50257,
+  "<|en|>": 50259,
+  "<|es|>": 50262,
+  "<|et|>": 50307,
+  "<|eu|>": 50310,
+  "<|fa|>": 50300,
+  "<|fi|>": 50277,
+  "<|fo|>": 50338,
+  "<|fr|>": 50265,
+  "<|gl|>": 50319,
+  "<|gu|>": 50333,
+  "<|haw|>": 50352,
+  "<|ha|>": 50354,
+  "<|he|>": 50279,
+  "<|hi|>": 50276,
+  "<|hr|>": 50291,
+  "<|ht|>": 50339,
+  "<|hu|>": 50286,
+  "<|hy|>": 50312,
+  "<|id|>": 50275,
+  "<|is|>": 50311,
+  "<|it|>": 50274,
+  "<|ja|>": 50266,
+  "<|jw|>": 50356,
+  "<|ka|>": 50329,
+  "<|kk|>": 50316,
+  "<|km|>": 50323,
+  "<|kn|>": 50306,
+  "<|ko|>": 50264,
+  "<|la|>": 50294,
+  "<|lb|>": 50345,
+  "<|ln|>": 50353,
+  "<|lo|>": 50336,
+  "<|lt|>": 50293,
+  "<|lv|>": 50301,
+  "<|mg|>": 50349,
+  "<|mi|>": 50295,
+  "<|mk|>": 50308,
+  "<|ml|>": 50296,
+  "<|mn|>": 50314,
+  "<|mr|>": 50320,
+  "<|ms|>": 50282,
+  "<|mt|>": 50343,
+  "<|my|>": 50346,
+  "<|ne|>": 50313,
+  "<|nl|>": 50271,
+  "<|nn|>": 50342,
+  "<|nospeech|>": 50363,
+  "<|notimestamps|>": 50364,
+  "<|no|>": 50288,
+  "<|oc|>": 50328,
+  "<|pa|>": 50321,
+  "<|pl|>": 50269,
+  "<|ps|>": 50340,
+  "<|pt|>": 50267,
+  "<|ro|>": 50284,
+  "<|ru|>": 50263,
+  "<|sa|>": 50344,
+  "<|sd|>": 50332,
+  "<|si|>": 50322,
+  "<|sk|>": 50298,
+  "<|sl|>": 50305,
+  "<|sn|>": 50324,
+  "<|so|>": 50326,
+  "<|sq|>": 50317,
+  "<|sr|>": 50303,
+  "<|startoflm|>": 50361,
+  "<|startofprev|>": 50362,
+  "<|startoftranscript|>": 50258,
+  "<|su|>": 50357,
+  "<|sv|>": 50273,
+  "<|sw|>": 50318,
+  "<|ta|>": 50287,
+  "<|te|>": 50299,
+  "<|tg|>": 50331,
+  "<|th|>": 50289,
+  "<|tk|>": 50341,
+  "<|tl|>": 50348,
+  "<|transcribe|>": 50360,
+  "<|translate|>": 50359,
+  "<|tr|>": 50268,
+  "<|tt|>": 50351,
+  "<|uk|>": 50280,
+  "<|ur|>": 50290,
+  "<|uz|>": 50337,
+  "<|vi|>": 50278,
+  "<|yi|>": 50335,
+  "<|yo|>": 50325,
+  "<|yue|>": 50358,
+  "<|zh|>": 50260
+}

checkpoint-500-epoch-0-val-wer-96.036/config.json ADDED Viewed

	@@ -0,0 +1,285 @@

+{
+  "_name_or_path": "NbAiLab/nb-whisper-large",
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "apply_spec_augment": false,
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0,
+  "begin_suppress_tokens": null,
+  "bos_token_id": 50257,
+  "classifier_proj_size": 256,
+  "d_model": 1280,
+  "decoder_attention_heads": 20,
+  "decoder_ffn_dim": 5120,
+  "decoder_layerdrop": 0,
+  "decoder_layers": 2,
+  "decoder_start_token_id": 50258,
+  "dropout": 0,
+  "encoder_attention_heads": 20,
+  "encoder_ffn_dim": 5120,
+  "encoder_layerdrop": 0,
+  "encoder_layers": 32,
+  "eos_token_id": 50257,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "lang_ids": [
+    50259,
+    50260,
+    50261,
+    50262,
+    50263,
+    50264,
+    50265,
+    50266,
+    50267,
+    50268,
+    50269,
+    50270,
+    50271,
+    50272,
+    50273,
+    50274,
+    50275,
+    50276,
+    50277,
+    50278,
+    50279,
+    50280,
+    50281,
+    50282,
+    50283,
+    50284,
+    50285,
+    50286,
+    50287,
+    50288,
+    50289,
+    50290,
+    50291,
+    50292,
+    50293,
+    50294,
+    50295,
+    50296,
+    50297,
+    50298,
+    50299,
+    50300,
+    50301,
+    50302,
+    50303,
+    50304,
+    50305,
+    50306,
+    50307,
+    50308,
+    50309,
+    50310,
+    50311,
+    50312,
+    50313,
+    50314,
+    50315,
+    50316,
+    50317,
+    50318,
+    50319,
+    50320,
+    50321,
+    50322,
+    50323,
+    50324,
+    50325,
+    50326,
+    50327,
+    50328,
+    50329,
+    50330,
+    50331,
+    50332,
+    50333,
+    50334,
+    50335,
+    50336,
+    50337,
+    50338,
+    50339,
+    50340,
+    50341,
+    50342,
+    50343,
+    50344,
+    50345,
+    50346,
+    50347,
+    50348,
+    50349,
+    50350,
+    50351,
+    50352,
+    50353,
+    50354,
+    50355,
+    50356,
+    50357,
+    50358
+  ],
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": null,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "median_filter_width": 7,
+  "model_type": "whisper",
+  "num_hidden_layers": 32,
+  "num_mel_bins": 128,
+  "pad_token_id": 50256,
+  "scale_embedding": false,
+  "suppress_ids": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "suppress_ids_begin": [
+    220,
+    50257
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 51866
+}

checkpoint-500-epoch-0-val-wer-96.036/generation_config.json ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "decoder_start_token_id": 50258,
+  "eos_token_id": 50257,
+  "is_multilingual": true,
+  "lang_to_id": {
+    "<|af|>": 50327,
+    "<|am|>": 50334,
+    "<|ar|>": 50272,
+    "<|as|>": 50350,
+    "<|az|>": 50304,
+    "<|ba|>": 50355,
+    "<|be|>": 50330,
+    "<|bg|>": 50292,
+    "<|bn|>": 50302,
+    "<|bo|>": 50347,
+    "<|br|>": 50309,
+    "<|bs|>": 50315,
+    "<|ca|>": 50270,
+    "<|cs|>": 50283,
+    "<|cy|>": 50297,
+    "<|da|>": 50285,
+    "<|de|>": 50261,
+    "<|el|>": 50281,
+    "<|en|>": 50259,
+    "<|es|>": 50262,
+    "<|et|>": 50307,
+    "<|eu|>": 50310,
+    "<|fa|>": 50300,
+    "<|fi|>": 50277,
+    "<|fo|>": 50338,
+    "<|fr|>": 50265,
+    "<|gl|>": 50319,
+    "<|gu|>": 50333,
+    "<|haw|>": 50352,
+    "<|ha|>": 50354,
+    "<|he|>": 50279,
+    "<|hi|>": 50276,
+    "<|hr|>": 50291,
+    "<|ht|>": 50339,
+    "<|hu|>": 50286,
+    "<|hy|>": 50312,
+    "<|id|>": 50275,
+    "<|is|>": 50311,
+    "<|it|>": 50274,
+    "<|ja|>": 50266,
+    "<|jw|>": 50356,
+    "<|ka|>": 50329,
+    "<|kk|>": 50316,
+    "<|km|>": 50323,
+    "<|kn|>": 50306,
+    "<|ko|>": 50264,
+    "<|la|>": 50294,
+    "<|lb|>": 50345,
+    "<|ln|>": 50353,
+    "<|lo|>": 50336,
+    "<|lt|>": 50293,
+    "<|lv|>": 50301,
+    "<|mg|>": 50349,
+    "<|mi|>": 50295,
+    "<|mk|>": 50308,
+    "<|ml|>": 50296,
+    "<|mn|>": 50314,
+    "<|mr|>": 50320,
+    "<|ms|>": 50282,
+    "<|mt|>": 50343,
+    "<|my|>": 50346,
+    "<|ne|>": 50313,
+    "<|nl|>": 50271,
+    "<|nn|>": 50342,
+    "<|no|>": 50288,
+    "<|oc|>": 50328,
+    "<|pa|>": 50321,
+    "<|pl|>": 50269,
+    "<|ps|>": 50340,
+    "<|pt|>": 50267,
+    "<|ro|>": 50284,
+    "<|ru|>": 50263,
+    "<|sa|>": 50344,
+    "<|sd|>": 50332,
+    "<|si|>": 50322,
+    "<|sk|>": 50298,
+    "<|sl|>": 50305,
+    "<|sn|>": 50324,
+    "<|so|>": 50326,
+    "<|sq|>": 50317,
+    "<|sr|>": 50303,
+    "<|su|>": 50357,
+    "<|sv|>": 50273,
+    "<|sw|>": 50318,
+    "<|ta|>": 50287,
+    "<|te|>": 50299,
+    "<|tg|>": 50331,
+    "<|th|>": 50289,
+    "<|tk|>": 50341,
+    "<|tl|>": 50348,
+    "<|tr|>": 50268,
+    "<|tt|>": 50351,
+    "<|uk|>": 50280,
+    "<|ur|>": 50290,
+    "<|uz|>": 50337,
+    "<|vi|>": 50278,
+    "<|yi|>": 50335,
+    "<|yo|>": 50325,
+    "<|yue|>": 50358,
+    "<|zh|>": 50260
+  },
+  "language": "no",
+  "max_initial_timestamp_index": 1,
+  "max_length": 448,
+  "no_timestamps_token_id": 50364,
+  "pad_token_id": 50257,
+  "return_timestamps": false,
+  "suppress_tokens": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "task": "transcribe",
+  "task_to_id": {
+    "transcribe": 50360,
+    "translate": 50359
+  },
+  "transformers_version": "4.46.2"
+}

checkpoint-500-epoch-0-val-wer-96.036/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500-epoch-0-val-wer-96.036/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dceef1c98c82eee48a3a948d1ca88682b48946a66a0d989d6be8c1c49205bed
+size 3025686376

checkpoint-500-epoch-0-val-wer-96.036/model_1.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28897ec4b789c0dc382a6975366fcb16206be64b6b691a60b218831c8f6af1ea
+size 4361070048

checkpoint-500-epoch-0-val-wer-96.036/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aadb2de09477d862c183c69ec3806328585b886a2e268366d2ef3cbfccb89257
+size 950951226

checkpoint-500-epoch-0-val-wer-96.036/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "chunk_length": 30,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

checkpoint-500-epoch-0-val-wer-96.036/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f618092df3bd36d40b256e361e5acec6a0f94cbd96621ff64d347a801af7f553
+size 14408

checkpoint-500-epoch-0-val-wer-96.036/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2627ccec0bb9a51b7d9d753a9441035aec88f305994eb3b5ccbb3e0571f519d6
+size 1064

checkpoint-500-epoch-0-val-wer-96.036/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+  "additional_special_tokens": [
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|yue|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nospeech|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-500-epoch-0-val-wer-96.036/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500-epoch-0-val-wer-96.036/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-500-epoch-0-val-wer-96.036/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,285 @@

+{
+  "_name_or_path": "./nb-distil-large-init",
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "apply_spec_augment": false,
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0,
+  "begin_suppress_tokens": null,
+  "bos_token_id": 50257,
+  "classifier_proj_size": 256,
+  "d_model": 1280,
+  "decoder_attention_heads": 20,
+  "decoder_ffn_dim": 5120,
+  "decoder_layerdrop": 0,
+  "decoder_layers": 2,
+  "decoder_start_token_id": 50258,
+  "dropout": 0,
+  "encoder_attention_heads": 20,
+  "encoder_ffn_dim": 5120,
+  "encoder_layerdrop": 0,
+  "encoder_layers": 32,
+  "eos_token_id": 50257,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "lang_ids": [
+    50259,
+    50260,
+    50261,
+    50262,
+    50263,
+    50264,
+    50265,
+    50266,
+    50267,
+    50268,
+    50269,
+    50270,
+    50271,
+    50272,
+    50273,
+    50274,
+    50275,
+    50276,
+    50277,
+    50278,
+    50279,
+    50280,
+    50281,
+    50282,
+    50283,
+    50284,
+    50285,
+    50286,
+    50287,
+    50288,
+    50289,
+    50290,
+    50291,
+    50292,
+    50293,
+    50294,
+    50295,
+    50296,
+    50297,
+    50298,
+    50299,
+    50300,
+    50301,
+    50302,
+    50303,
+    50304,
+    50305,
+    50306,
+    50307,
+    50308,
+    50309,
+    50310,
+    50311,
+    50312,
+    50313,
+    50314,
+    50315,
+    50316,
+    50317,
+    50318,
+    50319,
+    50320,
+    50321,
+    50322,
+    50323,
+    50324,
+    50325,
+    50326,
+    50327,
+    50328,
+    50329,
+    50330,
+    50331,
+    50332,
+    50333,
+    50334,
+    50335,
+    50336,
+    50337,
+    50338,
+    50339,
+    50340,
+    50341,
+    50342,
+    50343,
+    50344,
+    50345,
+    50346,
+    50347,
+    50348,
+    50349,
+    50350,
+    50351,
+    50352,
+    50353,
+    50354,
+    50355,
+    50356,
+    50357,
+    50358
+  ],
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": null,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "median_filter_width": 7,
+  "model_type": "whisper",
+  "num_hidden_layers": 32,
+  "num_mel_bins": 128,
+  "pad_token_id": 50256,
+  "scale_embedding": false,
+  "suppress_ids": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "suppress_ids_begin": [
+    220,
+    50257
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 51866
+}

create_student_model.py ADDED Viewed

	@@ -0,0 +1,231 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Initialise a student Whisper model from a pre-trained teacher model for
+teacher-student distillation.
+"""
+import argparse
+import copy
+import logging
+import numpy as np
+import torch
+from transformers import GenerationConfig, WhisperForConditionalGeneration, WhisperProcessor
+logger = logging.getLogger(__name__)
def _str_to_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` simply calls ``bool(string)``, which is True for
    every non-empty string -- including "False". This converter maps the usual
    spellings explicitly instead.

    Args:
        value: The raw command-line string (an actual ``bool`` is passed through).

    Returns:
        bool: True for "true"/"t"/"yes"/"y"/"1"/"on", False for
        "false"/"f"/"no"/"n"/"0"/"off" and for the empty string (comparison is
        case-insensitive and ignores surrounding whitespace).

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalised = value.strip().lower()
    if normalised in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays False, matching what ``bool("")`` produced before.
    if normalised in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def parse_args():
    """Build the command-line parser for student initialisation and parse ``sys.argv``.

    Required options are ``--teacher_checkpoint`` and ``--save_dir``. Optional ones are
    ``--subfolder`` (default ""), ``--encoder_layers`` (default None),
    ``--decoder_layers`` (default 2), ``--decoder_layers_numbers`` (zero or more ints,
    default None), ``--push_to_hub`` (default False) and ``--cache_dir`` (default None).

    Returns:
        argparse.Namespace: The parsed arguments, one attribute per option above.
    """
    parser = argparse.ArgumentParser(
        description="Initialise a student Whisper model from a teacher model, copying the relevant layer weights and adjusting the processor as necessary."
    )
    parser.add_argument(
        "--teacher_checkpoint",
        type=str,
        required=True,
        help="The HF Hub ID of the teacher checkpoint.",
    )
    parser.add_argument(
        "--subfolder",
        type=str,
        default="",
        help="In case the relevant teacher weights are located inside a subfolder of the model repo on huggingface.co, you "
        "can specify the folder name here.",
    )
    parser.add_argument(
        "--encoder_layers",
        type=int,
        default=None,
        help="Number of encoder layers to use in the student model. Defaults to all layers from the teacher.",
    )
    parser.add_argument(
        "--decoder_layers",
        type=int,
        default=2,
        help="Number of decoder layers to use in the student model. Defaults to 2 layers.",
    )
    parser.add_argument(
        "--decoder_layers_numbers",
        type=int,
        nargs="*",
        help="Layers numbers of the decoder teacher to use in the student model. Defaults to None, equivalent to taking first and last layer (and equivalent to `--decoder_layers_numbers 0 -1`).",
    )
    parser.add_argument(
        "--save_dir",
        type=str,
        required=True,
        help="Where to save the student weights and processor.",
    )
    parser.add_argument(
        "--push_to_hub",
        # `type=bool` would turn "--push_to_hub False" into True; parse the text instead.
        type=_str_to_bool,
        # Allow the bare flag form: "--push_to_hub" with no value means True.
        nargs="?",
        const=True,
        required=False,
        default=False,
        help="Whether to push the student weights and processor to the Hub.",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=None,
        help="Where to store the pretrained models downloaded from huggingface.co",
    )
    args = parser.parse_args()
    return args
+def init_student_model_from_teacher(
+    teacher_checkpoint,
+    encoder_layers=None,
+    decoder_layers=2,
+    decoder_layers_numbers=None,
+    save_dir=None,
+    push_to_hub=None,
+    cache_dir=None,
+    subfolder="",
+):
+    if decoder_layers_numbers is not None and len(decoder_layers_numbers) != decoder_layers:
+        raise ValueError(
+            f"Got {len(decoder_layers_numbers)} layers number for {decoder_layers} decoder layers."
+        )
+    teacher_model = WhisperForConditionalGeneration.from_pretrained(
+        teacher_checkpoint,
+        cache_dir=cache_dir,
+        subfolder=subfolder,
+        low_cpu_mem_usage=True,
+    )
+    processor = WhisperProcessor.from_pretrained(teacher_checkpoint)
+    generation_config = GenerationConfig.from_pretrained(teacher_checkpoint)
+    generation_config.forced_decoder_ids = None
+    teacher_config = teacher_model.config
+    teacher_encoder_layers = teacher_config.encoder_layers
+    teacher_decoder_layers = teacher_config.decoder_layers
+    student_config = copy.deepcopy(teacher_config)
+    student_config.update(
+        {
+            "encoder_layers": encoder_layers if encoder_layers is not None else teacher_encoder_layers,
+            "decoder_layers": decoder_layers,
+        }
+    )
+    encoder_mapping = np.linspace(0, teacher_encoder_layers - 1, student_config.encoder_layers, dtype=int)
+    encoder_mapping[-1] = teacher_encoder_layers - 1
+    encoder_map = {}
+    for student_layer, teacher_layer in enumerate(encoder_mapping):
+        encoder_map[teacher_layer] = student_layer
+    if decoder_layers_numbers is None:
+        decoder_mapping = np.linspace(0, teacher_decoder_layers - 1, student_config.decoder_layers, dtype=int)
+        decoder_mapping[-1] = teacher_decoder_layers - 1
+    else:
+        decoder_mapping = decoder_layers_numbers
+    decoder_map = {}
+    for student_layer, teacher_layer in enumerate(decoder_mapping):
+        decoder_map[teacher_layer] = student_layer
+    # init the student params from the teacher model
+    student_model = WhisperForConditionalGeneration(student_config)
+    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
+    if len(missing_keys) > 0:
+        raise RuntimeError(
+            "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+            f"Missing key(s) in state_dict: {missing_keys}"
+        )
+    if decoder_layers == teacher_decoder_layers:
+        decoder_keys = [key for key in unexpected_keys if "model.decoder.layers" in key]
+        if len(decoder_keys) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+                f"Unexpected key(s) in state_dict: {decoder_keys}"
+            )
+    if encoder_layers == teacher_encoder_layers:
+        encoder_keys = [key for key in unexpected_keys if "model.encoder.layers" in key]
+        if len(encoder_keys) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+                f"Unexpected key(s) in state_dict: {encoder_keys}"
+            )
+    for layer in range(teacher_decoder_layers):
+        if layer in decoder_map:
+            # re-introduce pre-defined layers from the teacher
+            student_model.model.decoder.layers[decoder_map[layer]].load_state_dict(
+                teacher_model.model.decoder.layers[layer].state_dict()
+            )
+    if encoder_layers is not None:
+        for layer in range(teacher_encoder_layers):
+            if layer in encoder_map:
+                # re-introduce pre-defined layers from the teacher
+                student_model.model.encoder.layers[encoder_map[layer]].load_state_dict(
+                    teacher_model.model.encoder.layers[layer].state_dict()
+                )
+    # remove the teacher params and model
+    del teacher_model
+    # save the converted weights and model
+    if save_dir is not None:
+        student_model.save_pretrained(save_dir)
+        # we also need to correctly save the processor and generation config
+        processor.save_pretrained(save_dir)
+        generation_config.save_pretrained(save_dir)
+    # check we can do a forward pass with the saved model - first load the weights and processor
+    logger.info("Checking we can load the saved model...")
+    student_model = WhisperForConditionalGeneration.from_pretrained(
+        save_dir,
+        low_cpu_mem_usage=True,
+    )
+    processor = WhisperProcessor.from_pretrained(save_dir)
+    # define some random inputs
+    input_features = processor(np.ones(16000), sampling_rate=16000, return_tensors="pt").input_features
+    decoder_start_token_id = student_model.config.decoder_start_token_id
+    decoder_input_ids = torch.ones((input_features.shape[0], 1), dtype=torch.long) * decoder_start_token_id
+    # do a forward pass - outputs will be gibberish for the initialised model so we can't check them
+    # but we make can sure the model runs as expected
+    logger.info("Checking we can run the converted model forward...")
+    _ = student_model(input_features, decoder_input_ids=decoder_input_ids).logits
+    logger.info("Conversion successful!")
+    if push_to_hub:
+        student_model.push_to_hub(save_dir)
+        processor.push_to_hub(save_dir)
+        generation_config.push_to_hub(save_dir)
+if __name__ == "__main__":
+    args = parse_args()
+    init_student_model_from_teacher(
+        teacher_checkpoint=args.teacher_checkpoint,
+        encoder_layers=args.encoder_layers,
+        decoder_layers=args.decoder_layers,
+        decoder_layers_numbers=args.decoder_layers_numbers,
+        save_dir=args.save_dir,
+        push_to_hub=args.push_to_hub,
+        cache_dir=args.cache_dir,
+        subfolder=args.subfolder,
+    )

distil-whisper/events.out.tfevents.1730988960.a100-80-west4a.48904.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e58b3108b1d79614ead26bc71b9b79ba8d077ea73b93f222724dde951c1e8ab6
+size 88

distil-whisper/events.out.tfevents.1730989066.a100-80-west4a.49408.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:102f2dfcdad2383784b10a2967b71981fae5a614ad250eec9e83968e1215469e
+size 88

distil-whisper/events.out.tfevents.1730989452.a100-80-west4a.68077.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd082b899310b318f1ae0d551d3be510323bf1da0270e0c3e78d7ccdecd4d696
+size 88

distil-whisper/events.out.tfevents.1730990001.a100-80-west4a.87125.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b9c1f6dc20114fde3decac69cde54d7a3b8d982cae625be16f3c2c2aafe78e9
+size 1055

distil_whisper/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__version__ = "0.0.1"
+from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
+from .partitioner import PjitPartitioner
+from .pipeline import FlaxWhisperPipeline
+from .train_state import InferenceState

distil_whisper/layers.py ADDED Viewed

	@@ -0,0 +1,1338 @@

+# Copyright 2022 The T5X Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dense attention classes and mask/weighting functions."""
+# pylint: disable=attribute-defined-outside-init,g-bare-generic
+import dataclasses
+import functools
+import operator
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Union
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax import linen as nn
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.dtypes import promote_dtype
+from jax import lax, random
+# from flax.linen.partitioning import param_with_axes, with_sharding_constraint
+# Module-level shortcuts for the two flax partitioning helpers used by the layers below.
+param_with_axes = nn_partitioning.param_with_axes
+with_sharding_constraint = nn_partitioning.with_sharding_constraint
+# Type annotations
+Array = jnp.ndarray
+DType = jnp.dtype
+PRNGKey = jnp.ndarray
+Shape = Iterable[int]
+Activation = Callable[..., Array]
+PrecisionLike = Union[None, str, lax.Precision, Tuple[str, str], Tuple[lax.Precision, lax.Precision]]
+DotGeneralT = Callable[..., Array]
+ConvGeneralDilatedT = Callable[..., Array]
+PaddingLike = Union[str, int, Sequence[Union[int, Tuple[int, int]]]]
+LaxPadding = Union[str, Sequence[Tuple[int, int]]]
+# Parameter initializers.
+# `Initializer` takes (key, shape, dtype); `NdInitializer` additionally takes the
+# input and output axes of the kernel as its last two arguments.
+Initializer = Callable[[PRNGKey, Shape, DType], Array]
+InitializerAxis = Union[int, Tuple[int, ...]]
+NdInitializer = Callable[[PRNGKey, Shape, DType, InitializerAxis, InitializerAxis], Array]
+# Default initializer for embedding tables (see `Embed.embedding_init`).
+default_embed_init = nn.initializers.variance_scaling(1.0, "fan_in", "normal", out_axis=0)
+# ------------------------------------------------------------------------------
+# Temporary inlined JAX N-d initializer code
+# TODO(levskaya): remove once new JAX release is out.
+# ------------------------------------------------------------------------------
+def _compute_fans(shape: jax.core.NamedShape, in_axis=-2, out_axis=-1):
+    """Inlined JAX `nn.initializer._compute_fans`."""
+    if isinstance(in_axis, int):
+        in_size = shape[in_axis]
+    else:
+        in_size = int(np.prod([shape[i] for i in in_axis]))
+    if isinstance(out_axis, int):
+        out_size = shape[out_axis]
+    else:
+        out_size = int(np.prod([shape[i] for i in out_axis]))
+    receptive_field_size = shape.total / in_size / out_size
+    fan_in = in_size * receptive_field_size
+    fan_out = out_size * receptive_field_size
+    return fan_in, fan_out
+def variance_scaling(scale, mode, distribution, in_axis=-2, out_axis=-1, dtype=jnp.float_):
+    """Inlined JAX `nn.initializer.variance_scaling`.
+    Returns an `init(key, shape, dtype)` function. As written, that function
+    returns an all-zeros array of the requested shape and dtype: the first
+    statement of `init` is an unconditional `return`, so the variance-scaling
+    code that follows it is never executed and `scale`, `mode`, `distribution`,
+    `in_axis` and `out_axis` have no effect on the result.
+    """
+    def init(key, shape, dtype=dtype):
+        # NOTE(review): this early return short-circuits the whole initializer, so every
+        # parameter created through it starts at zero. Presumably a shortcut for models whose
+        # weights are always overwritten by a loaded checkpoint - confirm before relying on
+        # random initialisation from this function.
+        return jnp.zeros(shape, dtype=dtype)
+        # ---- unreachable below this line (kept as the reference implementation) ----
+        dtype = jax.dtypes.canonicalize_dtype(dtype)
+        shape = jax.core.as_named_shape(shape)
+        fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
+        if mode == "fan_in":
+            denominator = fan_in
+        elif mode == "fan_out":
+            denominator = fan_out
+        elif mode == "fan_avg":
+            denominator = (fan_in + fan_out) / 2
+        else:
+            raise ValueError("invalid mode for variance scaling initializer: {}".format(mode))
+        variance = jnp.array(scale / denominator, dtype=dtype)
+        if distribution == "truncated_normal":
+            # constant is stddev of standard normal truncated to (-2, 2)
+            stddev = jnp.sqrt(variance) / jnp.array(0.87962566103423978, dtype)
+            return random.truncated_normal(key, -2, 2, shape, dtype) * stddev
+        elif distribution == "normal":
+            return random.normal(key, shape, dtype) * jnp.sqrt(variance)
+        elif distribution == "uniform":
+            return random.uniform(key, shape, dtype, -1) * jnp.sqrt(3 * variance)
+        else:
+            raise ValueError("invalid distribution for variance scaling initializer: {}".format(distribution))
+    return init
+# ------------------------------------------------------------------------------
+def nd_dense_init(scale, mode, distribution):
+    """Initializer with in_axis, out_axis set at call time."""
+    def init_fn(key, shape, dtype, in_axis, out_axis):
+        fn = variance_scaling(scale, mode, distribution, in_axis, out_axis)
+        return fn(key, shape, dtype)
+    return init_fn
+def dot_product_attention(
+    query: Array,
+    key: Array,
+    value: Array,
+    bias: Optional[Array] = None,
+    dropout_rng: Optional[PRNGKey] = None,
+    dropout_rate: float = 0.0,
+    deterministic: bool = False,
+    dtype: DType = jnp.float32,
+    float32_logits: bool = False,
+):
+    """Computes dot-product attention given query, key, and value.
+    This is the core function for applying attention based on
+    https://arxiv.org/abs/1706.03762. It calculates the attention weights given
+    query and key and combines the values using the attention weights.
+    Args:
+      query: queries for calculating attention with shape of `[batch, q_length,
+        num_heads, qk_depth_per_head]`.
+      key: keys for calculating attention with shape of `[batch, kv_length,
+        num_heads, qk_depth_per_head]`.
+      value: values to be used in attention with shape of `[batch, kv_length,
+        num_heads, v_depth_per_head]`.
+      bias: bias for the attention weights. This should be broadcastable to the
+        shape `[batch, num_heads, q_length, kv_length]` This can be used for
+        incorporating causal masks, padding masks, proximity bias, etc.
+      dropout_rng: JAX PRNGKey: to be used for dropout
+      dropout_rate: dropout rate
+      deterministic: bool, deterministic or not (to apply dropout)
+      dtype: the dtype of the computation (default: float32)
+      float32_logits: bool, if True then compute logits in float32 to avoid
+        numerical issues with bfloat16.
+    Returns:
+      Output of shape `[batch, length, num_heads, v_depth_per_head]`.
+    """
+    assert key.ndim == query.ndim == value.ndim, "q, k, v must have same rank."
+    assert query.shape[:-3] == key.shape[:-3] == value.shape[:-3], "q, k, v batch dims must match."
+    assert query.shape[-2] == key.shape[-2] == value.shape[-2], "q, k, v num_heads must match."
+    assert key.shape[-3] == value.shape[-3], "k, v lengths must match."
+    assert query.shape[-1] == key.shape[-1], "q, k depths must match."
+    # Casting logits and softmax computation for float32 for model stability.
+    if float32_logits:
+        query = query.astype(jnp.float32)
+        key = key.astype(jnp.float32)
+    # `attn_weights`: [batch, num_heads, q_length, kv_length]
+    attn_weights = jnp.einsum("bqhd,bkhd->bhqk", query, key)
+    # Apply attention bias: masking, dropout, proximity bias, etc.
+    if bias is not None:
+        attn_weights = attn_weights + bias.astype(attn_weights.dtype)
+    # Normalize the attention weights across `kv_length` dimension.
+    attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+    # Apply attention dropout.
+    if not deterministic and dropout_rate > 0.0:
+        keep_prob = 1.0 - dropout_rate
+        # T5 broadcasts along the "length" dim, but unclear which one that
+        # corresponds to in positional dimensions here, assuming query dim.
+        dropout_shape = list(attn_weights.shape)
+        dropout_shape[-2] = 1
+        keep = random.bernoulli(dropout_rng, keep_prob, dropout_shape)
+        keep = jnp.broadcast_to(keep, attn_weights.shape)
+        multiplier = keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype)
+        attn_weights = attn_weights * multiplier
+    # Take the linear combination of `value`.
+    return jnp.einsum("bhqk,bkhd->bqhd", attn_weights, value)
+# `lax.dynamic_slice_in_dim` vectorised over its second argument (the start index):
+# the operand, slice size and axis are shared, and one slice is produced per start index.
+dynamic_vector_slice_in_dim = jax.vmap(lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None))
+class MultiHeadDotProductAttention(nn.Module):
+    """Multi-head dot-product attention.
+    Attributes:
+      num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
+        should be divisible by the number of heads.
+      head_dim: dimension of each head.
+      dtype: the dtype of the computation.
+      dropout_rate: dropout rate
+      kernel_init: initializer for the kernel of the Dense layers.
+      float32_logits: bool, if True then compute logits in float32 to avoid
+        numerical issues with bfloat16.
+    """
+    num_heads: int
+    head_dim: int
+    dtype: DType = jnp.float32
+    dropout_rate: float = 0.0
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
+    float32_logits: bool = False  # computes logits in float32 for stability.
+    @nn.compact
+    def __call__(
+        self,
+        inputs_q: Array,
+        inputs_kv: Array,
+        mask: Optional[Array] = None,
+        bias: Optional[Array] = None,
+        *,
+        decode: bool = False,
+        deterministic: bool = False,
+    ) -> Array:
+        """Applies multi-head dot product attention on the input data.
+        Projects the inputs into multi-headed query, key, and value vectors,
+        applies dot-product attention and project the results to an output vector.
+        There are two modes: decoding and non-decoding (e.g., training). The mode is
+        determined by `decode` argument. For decoding, this method is called twice,
+        first to initialize the cache and then for an actual decoding process. The
+        two calls are differentiated by the presence of 'cached_key' in the variable
+        dict. In the cache initialization stage, the cache variables are initialized
+        as zeros and will be filled in the subsequent decoding process.
+        In the cache initialization call, `inputs_q` has a shape [batch, length,
+        q_features] and `inputs_kv`: [batch, length, kv_features]. During the
+        incremental decoding stage, query, key and value all have the shape [batch,
+        1, qkv_features] corresponding to a single step.
+        Args:
+          inputs_q: input queries of shape `[batch, q_length, q_features]`.
+          inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
+          mask: attention mask of shape `[batch, num_heads, q_length, kv_length]`.
+          bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
+          decode: Whether to prepare and use an autoregressive cache.
+          deterministic: Disables dropout if set to True.
+        Returns:
+          output of shape `[batch, length, q_features]`.
+        Raises:
+          ValueError: in decode mode with an initialized cache, if the query does
+            not have the single-step shape implied by the cached key.
+        """
+        # Factory shared by the query/key/value projections: each one maps the last
+        # input axis onto (num_heads, head_dim).
+        projection = functools.partial(
+            DenseGeneral,
+            axis=-1,
+            features=(self.num_heads, self.head_dim),
+            kernel_axes=("embed", "heads", "kv"),
+            dtype=self.dtype,
+        )
+        # NOTE: T5 does not explicitly rescale the attention logits by
+        #       1/sqrt(depth_kq)!  This is folded into the initializers of the
+        #       linear transformations, which is equivalent under Adafactor.
+        depth_scaling = jnp.sqrt(self.head_dim).astype(self.dtype)
+        def query_init(*args):
+            # Same initializer as the other projections, divided by sqrt(head_dim).
+            return self.kernel_init(*args) / depth_scaling
+        # Project inputs_q to multi-headed q/k/v
+        # dimensions are then [batch, length, num_heads, head_dim]
+        query = projection(kernel_init=query_init, name="query")(inputs_q)
+        key = projection(kernel_init=self.kernel_init, name="key")(inputs_kv)
+        value = projection(kernel_init=self.kernel_init, name="value")(inputs_kv)
+        query = with_sharding_constraint(query, ("batch", "length", "heads", "kv"))
+        key = with_sharding_constraint(key, ("batch", "length", "heads", "kv"))
+        value = with_sharding_constraint(value, ("batch", "length", "heads", "kv"))
+        if decode:
+            # Detect if we're initializing by absence of existing cache data.
+            # This must be evaluated before the `self.variable` calls below create the entries.
+            is_initialized = self.has_variable("cache", "cached_key")
+            # The key and value have dimension [batch, length, num_heads, head_dim],
+            # but we cache them as [batch, num_heads, head_dim, length] as a TPU
+            # fusion optimization. This also enables the "scatter via one-hot
+            # broadcast" trick, which means we do a one-hot broadcast instead of a
+            # scatter/gather operations, resulting in a 3-4x speedup in practice.
+            def swap_dims(x):
+                # Operates on a shape tuple: moves the third-to-last entry (length) to the end.
+                return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
+            cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
+            cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
+            cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+            if is_initialized:
+                batch, num_heads, head_dim, length = cached_key.value.shape
+                # During fast autoregressive decoding, we feed one position at a time,
+                # and cache the keys and values step by step.
+                # Sanity shape check of cached key against input query.
+                expected_shape = (batch, 1, num_heads, head_dim)
+                if expected_shape != query.shape:
+                    raise ValueError(
+                        "Autoregressive cache shape error, "
+                        "expected query shape %s instead got %s." % (expected_shape, query.shape)
+                    )
+                # Create a OHE of the current index. NOTE: the index is increased below.
+                cur_index = cache_index.value
+                one_hot_indices = jax.nn.one_hot(cur_index, length, dtype=key.dtype)
+                # In order to update the key, value caches with the current key and
+                # value, we move the length axis to the back, similar to what we did for
+                # the cached ones above.
+                # Note these are currently the key and value of a single position, since
+                # we feed one position at a time.
+                one_token_key = jnp.moveaxis(key, -3, -1)
+                one_token_value = jnp.moveaxis(value, -3, -1)
+                # Update key, value caches with our new 1d spatial slices.
+                # We implement an efficient scatter into the cache via one-hot
+                # broadcast and addition.
+                # (The addition only acts as a write because unfilled cache slots are zero.)
+                key = cached_key.value + one_token_key * one_hot_indices
+                value = cached_value.value + one_token_value * one_hot_indices
+                cached_key.value = key
+                cached_value.value = value
+                cache_index.value = cache_index.value + 1
+                # Move the keys and values back to their original shapes.
+                key = jnp.moveaxis(key, -1, -3)
+                value = jnp.moveaxis(value, -1, -3)
+                # Causal mask for cached decoder self-attention: our single query
+                # position should only attend to those key positions that have already
+                # been generated and cached, not the remaining zero elements.
+                mask = combine_masks(
+                    mask,
+                    jnp.broadcast_to(
+                        jnp.arange(length) <= cur_index,
+                        # (1, 1, length) represent (head dim, query length, key length)
+                        # query length is 1 because during decoding we deal with one
+                        # index.
+                        # The same mask is applied to all batch elements and heads.
+                        (batch, 1, 1, length),
+                    ),
+                )
+                # Grab the correct relative attention bias during decoding. This is
+                # only required during single step decoding.
+                if bias is not None:
+                    # The bias is a full attention matrix, but during decoding we only
+                    # have to take a slice of it.
+                    # This is equivalent to bias[..., cur_index:cur_index+1, :].
+                    # NOTE(review): `jnp.squeeze(bias, axis=0)` requires the leading bias
+                    # dimension to be 1 - confirm callers never pass a per-example bias here.
+                    bias = dynamic_vector_slice_in_dim(jnp.squeeze(bias, axis=0), jnp.reshape(cur_index, (-1)), 1, -2)
+        # Convert the boolean attention mask to an attention bias.
+        if mask is not None:
+            # attention mask in the form of attention bias
+            # (0 where the mask is positive, -1e10 everywhere else)
+            attention_bias = lax.select(
+                mask > 0,
+                jnp.full(mask.shape, 0.0).astype(self.dtype),
+                jnp.full(mask.shape, -1e10).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        # Add provided bias term (e.g. relative position embedding).
+        if bias is not None:
+            attention_bias = combine_biases(attention_bias, bias)
+        # A dropout RNG is only requested when dropout will actually be applied.
+        dropout_rng = None
+        if not deterministic and self.dropout_rate > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        # Apply attention.
+        x = dot_product_attention(
+            query,
+            key,
+            value,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout_rate,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            float32_logits=self.float32_logits,
+        )
+        # Back to the original inputs dimensions.
+        # The (heads, head_dim) axes are contracted together by a single projection.
+        out = DenseGeneral(
+            features=inputs_q.shape[-1],  # output dim is set to the input dim.
+            axis=(-2, -1),
+            kernel_init=self.kernel_init,
+            kernel_axes=("heads", "kv", "embed"),
+            dtype=self.dtype,
+            name="out",
+        )(x)
+        return out
+def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int]:
+    # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
+    return tuple([ax if ax >= 0 else ndim + ax for ax in axes])
+def _canonicalize_tuple(x):
+    if isinstance(x, Iterable):
+        return tuple(x)
+    else:
+        return (x,)
+# ------------------------------------------------------------------------------
+# DenseGeneral for attention layers.
+# ------------------------------------------------------------------------------
+class DenseGeneral(nn.Module):
+    """A linear transformation (with an optional bias) with flexible axes.
+    Attributes:
+      features: tuple with numbers of output features.
+      axis: tuple with axes to apply the transformation on.
+      dtype: the dtype of the computation (default: float32).
+      params_dtype: the dtype the kernel and bias parameters are created with
+        (default: float32).
+      kernel_init: initializer function for the weight matrix.
+      kernel_axes: logical axis names for the kernel; the last name is also
+        used for the bias.
+      use_bias: whether a bias is created and added to the output
+        (default: True).
+      bias_init: initializer function for the bias.
+    """
+    features: Union[Iterable[int], int]
+    axis: Union[Iterable[int], int] = -1
+    dtype: DType = jnp.float32
+    params_dtype: DType = jnp.float32
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
+    kernel_axes: Tuple[str, ...] = ()
+    use_bias: bool = True
+    bias_init: Any = nn.initializers.zeros
+    @nn.compact
+    def __call__(self, inputs: Array) -> Array:
+        """Applies a linear transformation to the inputs along multiple dimensions.
+        Args:
+          inputs: The nd-array to be transformed.
+        Returns:
+          The transformed input.
+        """
+        features = _canonicalize_tuple(self.features)
+        axis = _canonicalize_tuple(self.axis)
+        inputs = jnp.asarray(inputs, self.dtype)
+        axis = _normalize_axes(axis, inputs.ndim)
+        # Kernel layout: contracted input dimensions first, then the output features.
+        kernel_shape = tuple([inputs.shape[ax] for ax in axis]) + features
+        kernel_in_axis = np.arange(len(axis))
+        kernel_out_axis = np.arange(len(axis), len(axis) + len(features))
+        # The in/out axes are forwarded to `kernel_init` as its last two arguments.
+        kernel = param_with_axes(
+            "kernel",
+            self.kernel_init,
+            kernel_shape,
+            self.params_dtype,
+            kernel_in_axis,
+            kernel_out_axis,
+            axes=self.kernel_axes,
+        )
+        if self.use_bias:
+            # NOTE(review): `self.kernel_axes[-1]` raises IndexError when `kernel_axes` is left
+            # at its default `()` while `use_bias` is True - callers presumably always pass
+            # `kernel_axes`; confirm.
+            bias = param_with_axes(
+                "bias",
+                self.bias_init,
+                features,
+                self.params_dtype,
+                axes=(self.kernel_axes[-1],),
+            )
+        kernel = jnp.asarray(kernel, self.dtype)
+        # Contract the selected input axes with the leading kernel axes; no batch dimensions.
+        contract_ind = tuple(range(0, len(axis)))
+        y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
+        if self.use_bias:
+            bias = jnp.asarray(bias, self.dtype)
+            # y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,))
+            # When `y.ndim >= len(features)` the `(1,) * ...` prefix is empty, so the reshape
+            # keeps `bias.shape` and the addition relies on broadcasting over y's leading axes.
+            y += jnp.reshape(bias, (1,) * (len(features) - y.ndim) + bias.shape[:])
+        return y
+def _convert_to_activation_function(fn_or_string: Union[str, Callable]) -> Callable:
+    """Convert a string to an activation function."""
+    if fn_or_string == "linear":
+        return lambda x: x
+    elif isinstance(fn_or_string, str):
+        return getattr(nn, fn_or_string)
+    elif callable(fn_or_string):
+        return fn_or_string
+    else:
+        raise ValueError("don't know how to convert %s to an activation function" % (fn_or_string,))
+class MlpBlock(nn.Module):
+    """Transformer MLP / feed-forward block.
+    Attributes:
+      intermediate_dim: Shared dimension of hidden layers.
+      activations: Type of activations for each layer.  Each element is either
+        'linear', a string function name in flax.linen, or a function.
+      kernel_init: Kernel function, passed to the dense layers.
+      deterministic: Whether the dropout layers should be deterministic.
+      intermediate_dropout_rate: Dropout rate used after the intermediate layers.
+      dtype: Type for the dense layer.
+    """
+    intermediate_dim: int = 2048
+    activations: Sequence[Union[str, Callable]] = ("relu",)
+    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "truncated_normal")
+    intermediate_dropout_rate: float = 0.1
+    dtype: Any = jnp.float32
+    @nn.compact
+    def __call__(self, inputs, decode: bool = False, deterministic: bool = False):
+        """Applies Transformer MlpBlock module."""
+        # Iterate over specified MLP input activation functions.
+        # e.g. ('relu',) or ('gelu', 'linear') for gated-gelu.
+        activations = []
+        for idx, act_fn in enumerate(self.activations):
+            dense_name = "wi" if len(self.activations) == 1 else f"wi_{idx}"
+            x = DenseGeneral(
+                self.intermediate_dim,
+                dtype=self.dtype,
+                kernel_init=self.kernel_init,
+                kernel_axes=("embed", "mlp"),
+                name=dense_name,
+            )(inputs)
+            x = _convert_to_activation_function(act_fn)(x)
+            activations.append(x)
+        # Take elementwise product of above intermediate activations.
+        x = functools.reduce(operator.mul, activations)
+        # Apply dropout and final dense output projection.
+        x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))(
+            x, deterministic=deterministic
+        )  # Broadcast along length.
+        x = with_sharding_constraint(x, ("batch", "length", "mlp"))
+        output = DenseGeneral(
+            inputs.shape[-1],
+            dtype=self.dtype,
+            kernel_init=self.kernel_init,
+            kernel_axes=("mlp", "embed"),
+            name="wo",
+        )(x)
+        return output
+class Embed(nn.Module):
+    """A parameterized function from integers [0, n) to d-dimensional vectors.
+    Attributes:
+      num_embeddings: number of embeddings.
+      features: number of feature dimensions for each embedding.
+      dtype: the dtype of the embedding vectors (default: float32).
+      embedding_init: embedding initializer.
+      one_hot: performs the gather with a one-hot contraction rather than a true
+        gather. This is currently needed for SPMD partitioning.
+    """
+    num_embeddings: int
+    features: int
+    cast_input_dtype: Optional[DType] = None
+    dtype: DType = jnp.float32
+    params_dtype: DType = jnp.float32
+    attend_dtype: Optional[DType] = None
+    embedding_init: Initializer = default_embed_init
+    one_hot: bool = True
+    embedding: Array = dataclasses.field(init=False)
+    def setup(self):
+        self.embedding = param_with_axes(
+            "embedding",
+            self.embedding_init,
+            (self.num_embeddings, self.features),
+            self.params_dtype,
+            axes=("vocab", "embed"),
+        )
+    def __call__(self, inputs: Array) -> Array:
+        """Embeds the inputs along the last dimension.
+        Args:
+          inputs: input data, all dimensions are considered batch dimensions.
+        Returns:
+          Output which is embedded input data.  The output shape follows the input,
+          with an additional `features` dimension appended.
+        """
+        if self.cast_input_dtype:
+            inputs = inputs.astype(self.cast_input_dtype)
+        if not jnp.issubdtype(inputs.dtype, jnp.integer):
+            raise ValueError("Input type must be an integer or unsigned integer.")
+        if self.one_hot:
+            iota = lax.iota(jnp.int32, self.num_embeddings)
+            one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=self.dtype)
+            output = jnp.dot(one_hot, jnp.asarray(self.embedding, self.dtype))
+        else:
+            output = jnp.asarray(self.embedding, self.dtype)[inputs]
+            output = with_sharding_constraint(output, ("batch", "length", "embed"))
+        return output
+    def attend(self, query: Array) -> Array:
+        """Attend over the embedding using a query array.
+        Args:
+          query: array with last dimension equal the feature depth `features` of the
+            embedding.
+        Returns:
+          An array with final dim `num_embeddings` corresponding to the batched
+          inner-product of the array of query vectors against each embedding.
+          Commonly used for weight-sharing between embeddings and logit transform
+          in NLP models.
+        """
+        dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
+        return jnp.dot(query, jnp.asarray(self.embedding, dtype).T)
+class RelativePositionBiases(nn.Module):
+    """Produces T5-style relative position biases for the attention logits.
+    The module returns the bias array; adding it to the logits is left to the
+    caller.
+    Attributes:
+      num_buckets: Number of buckets to bucket distances between key and query
+        positions into.
+      max_distance: Maximum distance before everything is lumped into the last
+        distance bucket.
+      num_heads: Number of heads in the attention layer. Each head will get a
+        different relative position weighting.
+      dtype: Type of arrays through this module.
+      embedding_init: initializer for relative embedding table.
+    """
+    num_buckets: int
+    max_distance: int
+    num_heads: int
+    dtype: Any
+    embedding_init: Callable[..., Array] = nn.linear.default_embed_init
+    @staticmethod
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        """Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger
+        buckets for larger absolute relative_positions.  All relative
+        positions >=max_distance  map to the same bucket.  All relative
+        positions <=-max_distance map to the same bucket.  This should allow for
+        more graceful generalization to longer sequences than the model has been
+        trained on.
+        Args:
+          relative_position: an int32 array
+          bidirectional: a boolean - whether the attention is bidirectional
+          num_buckets: an integer
+          max_distance: an integer
+        Returns:
+          a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        # Flip the sign so that n = query_position - memory_position.
+        n = -relative_position
+        if bidirectional:
+            # Half of the buckets go to each sign; entries with n < 0 are
+            # offset into the upper half before taking the magnitude.
+            num_buckets //= 2
+            ret += (n < 0).astype(np.int32) * num_buckets
+            n = np.abs(n)
+        else:
+            # Unidirectional: negative n is clamped to 0.
+            n = np.maximum(n, 0)
+        # now n is in the range [0, inf)
+        # The first `max_exact` buckets hold one distance each.
+        max_exact = num_buckets // 2
+        is_small = n < max_exact
+        # The remaining buckets are logarithmically spaced up to max_distance.
+        # The eps term keeps the log finite for n == 0; those entries are
+        # replaced by `n` in the np.where below.
+        val_if_large = max_exact + (
+            np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps)
+            / np.log(max_distance / max_exact)
+            * (num_buckets - max_exact)
+        ).astype(np.int32)
+        # Everything beyond max_distance lands in the last bucket.
+        val_if_large = np.minimum(val_if_large, num_buckets - 1)
+        ret += np.where(is_small, n, val_if_large)
+        return ret
+    @nn.compact
+    def __call__(self, qlen, klen, bidirectional=True):
+        """Produce relative position embedding attention biases.
+        Args:
+          qlen: attention query length.
+          klen: attention key length.
+          bidirectional: whether to allow positive memory-query relative position
+            embeddings.
+        Returns:
+          output: `(1, num_heads, q_len, k_len)` attention bias
+        """
+        # TODO(levskaya): should we be computing this w. numpy as a program
+        # constant?
+        # Positions and bucket ids are built with NumPy (not jax.numpy).
+        context_position = np.arange(qlen, dtype=jnp.int32)[:, None]
+        memory_position = np.arange(klen, dtype=jnp.int32)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(
+            relative_position,
+            bidirectional=bidirectional,
+            num_buckets=self.num_buckets,
+            max_distance=self.max_distance,
+        )
+        # The bias table is always created in float32 and cast to `dtype` below.
+        relative_attention_bias = param_with_axes(
+            "rel_embedding",
+            self.embedding_init,
+            (self.num_heads, self.num_buckets),
+            jnp.float32,
+            axes=("heads", "relpos_buckets"),
+        )
+        relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
+        # Instead of using a slow gather, we create a leading-dimension one-hot
+        # array from rp_bucket and use it to perform the gather-equivalent via a
+        # contraction, i.e.:
+        # (num_head, num_buckets) x (num_buckets one-hot, qlen, klen).
+        # This is equivalent to relative_attention_bias[:, rp_bucket]
+        bcast_iota = lax.broadcasted_iota(jnp.int32, (self.num_buckets, 1, 1), 0)
+        # One-hot array of shape (num_buckets, qlen, klen).
+        rp_bucket_one_hot = jnp.array(rp_bucket[jnp.newaxis, ...] == bcast_iota, dtype=self.dtype)
+        # --> shape (num_heads, qlen, klen)
+        values = lax.dot_general(
+            relative_attention_bias,
+            rp_bucket_one_hot,
+            (((1,), (0,)), ((), ())),  # lhs, rhs contracting dims
+        )  # no batched dims
+        # Add a singleton batch dimension.
+        # --> shape (1, num_heads, qlen, klen)
+        return values[jnp.newaxis, ...]
+# ------------------------------------------------------------------------------
+# T5 Layernorm - no subtraction of mean or bias.
+# ------------------------------------------------------------------------------
+# class LayerNorm(nn.Module):
+#   """T5 Layer normalization operating on the last axis of the input data."""
+#   epsilon: float = 1e-6
+#   dtype: Any = jnp.float32
+#   scale_init: Initializer = nn.initializers.ones
+#   @nn.compact
+#   def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+#     """Applies layer normalization on the input."""
+#     x = jnp.asarray(x, jnp.float32)
+#     features = x.shape[-1]
+#     mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
+#     y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
+#     scale = param_with_axes(
+#         'scale', self.scale_init, (features,), jnp.float32, axes=('embed',))
+#     scale = jnp.asarray(scale, self.dtype)
+#     return y * scale
+class LayerNorm(nn.Module):
+    """Layer normalization (https://arxiv.org/abs/1607.06450).
+    Operates on the last axis of the input data.
+    It normalizes the activations of the layer for each given example in a
+    batch independently, rather than across a batch like Batch Normalization.
+    i.e. applies a transformation that maintains the mean activation within
+    each example close to 0 and the activation standard deviation close to 1.
+    Attributes:
+      epsilon: A small float added to variance to avoid dividing by zero.
+      dtype: the dtype of the computation (default: float32).
+      use_bias:  If True, bias (beta) is added.
+      use_scale: If True, multiply by scale (gamma). When the next layer is linear
+        (also e.g. nn.relu), this can be disabled since the scaling will be done
+        by the next layer.
+      bias_init: Initializer for bias, by default, zero.
+      scale_init: Initializer for scale, by default, one.
+    """
+    epsilon: float = 1e-6
+    dtype: Any = jnp.float32
+    params_dtype: DType = jnp.float32
+    use_bias: bool = True
+    use_scale: bool = True
+    bias_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.zeros
+    scale_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.ones
+    @nn.compact
+    def __call__(self, x):
+        """Applies layer normalization on the input.
+        Args:
+          x: the inputs
+        Returns:
+          Normalized inputs (the same shape as inputs).
+        """
+        x = jnp.asarray(x, jnp.float32)
+        features = x.shape[-1]
+        mean = jnp.mean(x, axis=-1, keepdims=True)
+        mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
+        var = mean2 - lax.square(mean)
+        mul = lax.rsqrt(var + self.epsilon)
+        if self.use_scale:
+            scale = param_with_axes(
+                "scale",
+                self.scale_init,
+                (features,),
+                self.params_dtype,
+                axes=("embed",),
+            )
+            mul = mul * jnp.asarray(scale, self.dtype)
+        y = (x - mean) * mul
+        if self.use_bias:
+            bias = param_with_axes("bias", self.bias_init, (features,), self.params_dtype, axes=("embed",))
+            y = y + jnp.asarray(bias, self.dtype)
+        return jnp.asarray(y, self.dtype)
+# ------------------------------------------------------------------------------
+# Mask-making utility functions.
+# ------------------------------------------------------------------------------
+def make_attention_mask(
+    query_input: Array,
+    key_input: Array,
+    pairwise_fn: Callable = jnp.multiply,
+    extra_batch_dims: int = 0,
+    dtype: DType = jnp.float32,
+) -> Array:
+    """Mask-making helper for attention weights.
+    In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`, the
+    attention weights will be `[batch, heads, len_q, len_kv]` and this
+    function will produce `[batch, 1, len_q, len_kv]`.
+    Args:
+      query_input: a batched, flat input of query_length size
+      key_input: a batched, flat input of key_length size
+      pairwise_fn: broadcasting elementwise comparison function
+      extra_batch_dims: number of extra batch dims to add singleton axes for, none
+        by default
+      dtype: mask return dtype
+    Returns:
+      A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
+    """
+    # [batch, len_q, len_kv]
+    mask = pairwise_fn(
+        # [batch, len_q] -> [batch, len_q, 1]
+        jnp.expand_dims(query_input, axis=-1),
+        # [batch, len_q] -> [batch, 1, len_kv]
+        jnp.expand_dims(key_input, axis=-2),
+    )
+    # [batch, 1, len_q, len_kv]. This creates the head dim.
+    mask = jnp.expand_dims(mask, axis=-3)
+    mask = jnp.expand_dims(mask, axis=tuple(range(extra_batch_dims)))
+    return mask.astype(dtype)
+def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.float32) -> Array:
+    """Make a causal mask for self-attention.
+    In case of 1d inputs (i.e., `[batch, len]`, the self-attention weights
+    will be `[batch, heads, len, len]` and this function will produce a
+    causal mask of shape `[batch, 1, len, len]`.
+    Note that a causal mask does not depend on the values of x; it only depends on
+    the shape. If x has padding elements, they will not be treated in a special
+    manner.
+    Args:
+      x: input array of shape `[batch, len]`
+      extra_batch_dims: number of batch dims to add singleton axes for, none by
+        default
+      dtype: mask return dtype
+    Returns:
+      A `[batch, 1, len, len]` shaped causal mask for 1d attention.
+    """
+    idxs = jnp.broadcast_to(jnp.arange(x.shape[-1], dtype=jnp.int32), x.shape)
+    return make_attention_mask(idxs, idxs, jnp.greater_equal, extra_batch_dims=extra_batch_dims, dtype=dtype)
+def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
+    """Combine attention masks.
+    Args:
+      *masks: set of attention mask arguments to combine, some can be None.
+      dtype: final mask dtype
+    Returns:
+      Combined mask, reduced by logical and, returns None if no masks given.
+    """
+    masks = [m for m in masks if m is not None]
+    if not masks:
+        return None
+    assert all(
+        (x.ndim == masks[0].ndim for x in masks)
+    ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
+    mask, *other_masks = masks
+    for other_mask in other_masks:
+        mask = jnp.logical_and(mask, other_mask)
+    return mask.astype(dtype)
+def combine_biases(*masks: Optional[Array]):
+    """Combine attention biases.
+    Args:
+      *masks: set of attention bias arguments to combine, some can be None.
+    Returns:
+      Combined mask, reduced by summation, returns None if no masks given.
+    """
+    masks = [m for m in masks if m is not None]
+    if not masks:
+        return None
+    assert all(
+        (x.ndim == masks[0].ndim for x in masks)
+    ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
+    mask, *other_masks = masks
+    for other_mask in other_masks:
+        mask = mask + other_mask
+    return mask
+def make_decoder_mask(
+    decoder_target_tokens: Array,
+    dtype: DType,
+    decoder_causal_attention: Optional[Array] = None,
+    decoder_segment_ids: Optional[Array] = None,
+) -> Array:
+    """Compute the self-attention mask for a decoder.
+    Decoder mask is formed by combining a causal mask, a padding mask and an
+    optional packing mask. If decoder_causal_attention is passed, it makes the
+    masking non-causal for positions that have value of 1.
+    A prefix LM is applied to a dataset which has a notion of "inputs" and
+    "targets", e.g., a machine translation task. The inputs and targets are
+    concatenated to form a new target. `decoder_target_tokens` is the concatenated
+    decoder output tokens.
+    The "inputs" portion of the concatenated sequence can attend to other "inputs"
+    tokens even for those at a later time steps. In order to control this
+    behavior, `decoder_causal_attention` is necessary. This is a binary mask with
+    a value of 1 indicating that the position belonged to "inputs" portion of the
+    original dataset.
+    Example:
+      Suppose we have a dataset with two examples.
+      ds = [{"inputs": [6, 7], "targets": [8]},
+            {"inputs": [3, 4], "targets": [5]}]
+      After the data preprocessing with packing, the two examples are packed into
+      one example with the following three fields (some fields are skipped for
+      simplicity).
+         decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
+           decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
+      decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
+      where each array has [batch, length] shape with batch size being 1. Then,
+      this function computes the following mask.
+                        mask = [[[[1, 1, 0, 0, 0, 0, 0],
+                                  [1, 1, 0, 0, 0, 0, 0],
+                                  [1, 1, 1, 0, 0, 0, 0],
+                                  [0, 0, 0, 1, 1, 0, 0],
+                                  [0, 0, 0, 1, 1, 0, 0],
+                                  [0, 0, 0, 1, 1, 1, 0],
+                                  [0, 0, 0, 0, 0, 0, 0]]]]
+      mask[b, 1, :, :] represents the mask for the example `b` in the batch.
+      Because mask is for a self-attention layer, the mask's shape is a square of
+      shape [query length, key length].
+      mask[b, 1, i, j] = 1 means that the query token at position i can attend to
+      the key token at position j.
+    Args:
+      decoder_target_tokens: decoder output tokens. [batch, length]
+      dtype: dtype of the output mask.
+      decoder_causal_attention: a binary mask indicating which position should
+        only attend to earlier positions in the sequence. Others will attend
+        bidirectionally. [batch, length]
+      decoder_segment_ids: decoder segmentation info for packed examples. [batch,
+        length]
+    Returns:
+      the combined decoder mask.
+    """
+    masks = []
+    # The same mask is applied to all attention heads. So the head dimension is 1,
+    # i.e., the mask will be broadcast along the heads dim.
+    # [batch, 1, length, length]
+    causal_mask = make_causal_mask(decoder_target_tokens, dtype=dtype)
+    # Positions with value 1 in `decoder_causal_attneition` can attend
+    # bidirectionally.
+    if decoder_causal_attention is not None:
+        # [batch, 1, length, length]
+        inputs_mask = make_attention_mask(
+            decoder_causal_attention,
+            decoder_causal_attention,
+            jnp.logical_and,
+            dtype=dtype,
+        )
+        masks.append(jnp.logical_or(causal_mask, inputs_mask).astype(dtype))
+    else:
+        masks.append(causal_mask)
+    # Padding mask.
+    masks.append(make_attention_mask(decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=dtype))
+    # Packing mask
+    if decoder_segment_ids is not None:
+        masks.append(make_attention_mask(decoder_segment_ids, decoder_segment_ids, jnp.equal, dtype=dtype))
+    return combine_masks(*masks, dtype=dtype)
+def canonicalize_padding(padding: PaddingLike, rank: int) -> LaxPadding:
+    """ "Canonicalizes conv padding to a jax.lax supported format."""
+    if isinstance(padding, str):
+        return padding
+    if isinstance(padding, int):
+        return [(padding, padding)] * rank
+    if isinstance(padding, Sequence) and len(padding) == rank:
+        new_pad = []
+        for p in padding:
+            if isinstance(p, int):
+                new_pad.append((p, p))
+            elif isinstance(p, tuple) and len(p) == 2:
+                new_pad.append(p)
+            else:
+                break
+        if len(new_pad) == rank:
+            return new_pad
+    raise ValueError(
+        f"Invalid padding format: {padding}, should be str, int,"
+        f" or a sequence of len {rank} where each element is an"
+        " int or pair of ints."
+    )
+def _conv_dimension_numbers(input_shape):
+    """Computes the dimension numbers based on the input shape."""
+    ndim = len(input_shape)
+    lhs_spec = (0, ndim - 1) + tuple(range(1, ndim - 1))
+    rhs_spec = (ndim - 1, ndim - 2) + tuple(range(0, ndim - 2))
+    out_spec = lhs_spec
+    return lax.ConvDimensionNumbers(lhs_spec, rhs_spec, out_spec)
+class _Conv(nn.Module):
+    """Convolution Module wrapping `lax.conv_general_dilated[_local]`.
+    Subclasses select shared (regular) or unshared (local) convolution by
+    overriding the `shared_weights` property.
+    Attributes:
+      features: number of convolution filters.
+      kernel_size: shape of the convolutional kernel. Must be a sequence of
+        integers, also for 1D convolution; passing a bare int raises a
+        `TypeError` in `__call__`.
+      strides: an integer or a sequence of `n` integers, representing the
+        inter-window strides (default: 1).
+      padding: either the string `'SAME'`, the string `'VALID'`, the string
+        `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
+        high)` integer pairs that give the padding to apply before and after each
+        spatial dimension. A single int is interpreted as applying the same padding
+        in all dims and passing a single int in a sequence causes the same padding
+        to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
+        left-pad the convolution axis, resulting in same-sized output.
+      input_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of `inputs`
+        (default: 1). Convolution with input dilation `d` is equivalent to
+        transposed convolution with stride `d`.
+      kernel_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of the convolution
+        kernel (default: 1). Convolution with kernel dilation
+        is also known as 'atrous convolution'.
+      feature_group_count: integer, default 1. If specified divides the input
+        features into groups.
+      use_bias: whether to add a bias to the output (default: True).
+      mask: Optional mask for the weights during masked convolution. The mask must
+            be the same shape as the convolution weight matrix.
+      dtype: the dtype of the computation (default: infer from input and params).
+      params_dtype: the dtype passed to parameter initializers (default: float32).
+      precision: numerical precision of the computation see `jax.lax.Precision`
+        for details.
+      kernel_init: initializer for the convolutional kernel.
+      bias_init: initializer for the bias.
+      conv_general_dilated: function used for the shared-weight convolution and
+        for the output-shape probe of the unshared one (default:
+        `lax.conv_general_dilated`).
+      kernel_axes: axis names passed to `param_with_axes` for the kernel; its
+        last entry is reused as the axis name of the bias.
+    """
+    features: int
+    kernel_size: Sequence[int]
+    strides: Union[None, int, Sequence[int]] = 1
+    padding: PaddingLike = "SAME"
+    input_dilation: Union[None, int, Sequence[int]] = 1
+    kernel_dilation: Union[None, int, Sequence[int]] = 1
+    feature_group_count: int = 1
+    use_bias: bool = True
+    mask: Optional[Array] = None
+    dtype: Optional[DType] = None
+    params_dtype: DType = jnp.float32
+    precision: PrecisionLike = None
+    kernel_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.lecun_normal()
+    bias_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.zeros
+    conv_general_dilated: ConvGeneralDilatedT = lax.conv_general_dilated
+    kernel_axes: Tuple[str, ...] = ()
+    @property
+    def shared_weights(self) -> bool:  # type: ignore
+        """Defines whether weights are shared or not between different pixels.
+        The base implementation has an empty body; subclasses are expected to
+        override it.
+        Returns:
+          `True` to use shared weights in convolution (regular convolution).
+          `False` to use different weights at different pixels, a.k.a.
+          "locally connected layer", "unshared convolution", or "local convolution".
+        """
+        ...
+    @nn.compact
+    def __call__(self, inputs: Array) -> Array:
+        """Applies a (potentially unshared) convolution to the inputs.
+        Args:
+          inputs: input data with dimensions (*batch_dims, spatial_dims...,
+            features). This is the channels-last convention, i.e. NHWC for a 2d
+            convolution and NDHWC for a 3D convolution. Note: this is different from
+            the input convention used by `lax.conv_general_dilated`, which puts the
+            spatial dimensions last.
+            Note: If the input has more than 1 batch dimension, all batch dimensions
+            are flattened into a single dimension for the convolution and restored
+            before returning.  In some cases directly vmap'ing the layer may yield
+            better performance than this default flattening approach.  If the input
+            lacks a batch dimension it will be added for the convolution and removed
+            on return, an allowance made to enable writing single-example code.
+        Returns:
+          The convolved data.
+        Raises:
+          TypeError: if `kernel_size` is a bare int.
+          ValueError: if `'CAUSAL'` padding is requested for a non-1D
+            convolution, or if `mask` does not match the kernel shape.
+          NotImplementedError: if an unshared convolution is requested with
+            `feature_group_count != 1`.
+        """
+        if isinstance(self.kernel_size, int):
+            raise TypeError(
+                "Expected Conv kernel_size to be a"
+                " tuple/list of integers (eg.: [3, 3]) but got"
+                f" {self.kernel_size}."
+            )
+        else:
+            kernel_size = tuple(self.kernel_size)
+        def maybe_broadcast(x: Optional[Union[int, Sequence[int]]]) -> Tuple[int, ...]:
+            # Expands a scalar (or None, treated as 1) to one entry per spatial dim.
+            if x is None:
+                # backward compatibility with using None as sentinel for
+                # broadcast 1
+                x = 1
+            if isinstance(x, int):
+                return (x,) * len(kernel_size)
+            return tuple(x)
+        # Combine all input batch dimensions into a single leading batch axis.
+        # With zero batch dims the reshape below adds a batch axis of size 1.
+        num_batch_dimensions = inputs.ndim - (len(kernel_size) + 1)
+        if num_batch_dimensions != 1:
+            input_batch_shape = inputs.shape[:num_batch_dimensions]
+            total_batch_size = int(np.prod(input_batch_shape))
+            flat_input_shape = (total_batch_size,) + inputs.shape[num_batch_dimensions:]
+            inputs = jnp.reshape(inputs, flat_input_shape)
+        # self.strides or (1,) * (inputs.ndim - 2)
+        strides = maybe_broadcast(self.strides)
+        input_dilation = maybe_broadcast(self.input_dilation)
+        kernel_dilation = maybe_broadcast(self.kernel_dilation)
+        padding_lax = canonicalize_padding(self.padding, len(kernel_size))
+        # 'CIRCULAR' and 'CAUSAL' are handled here by padding the input
+        # explicitly and then running the convolution with 'VALID' padding.
+        if padding_lax == "CIRCULAR":
+            kernel_size_dilated = [(k - 1) * d + 1 for k, d in zip(kernel_size, kernel_dilation)]
+            zero_pad: List[Tuple[int, int]] = [(0, 0)]
+            # No padding on the batch and feature axes; wrap-around on spatial axes.
+            pads = zero_pad + [((k - 1) // 2, k // 2) for k in kernel_size_dilated] + [(0, 0)]
+            inputs = jnp.pad(inputs, pads, mode="wrap")
+            padding_lax = "VALID"
+        elif padding_lax == "CAUSAL":
+            if len(kernel_size) != 1:
+                raise ValueError("Causal padding is only implemented for 1D convolutions.")
+            # Pad only on the left of the single spatial axis.
+            left_pad = kernel_dilation[0] * (kernel_size[0] - 1)
+            pads = [(0, 0), (left_pad, 0), (0, 0)]
+            inputs = jnp.pad(inputs, pads)
+            padding_lax = "VALID"
+        dimension_numbers = _conv_dimension_numbers(inputs.shape)
+        in_features = jnp.shape(inputs)[-1]
+        if self.shared_weights:
+            # One shared convolutional kernel for all pixels in the output.
+            assert in_features % self.feature_group_count == 0
+            kernel_shape = kernel_size + (
+                in_features // self.feature_group_count,
+                self.features,
+            )
+        else:
+            if self.feature_group_count != 1:
+                raise NotImplementedError(
+                    "`lax.conv_general_dilated_local` does not support "
+                    f"`feature_group_count != 1`, got `{self.feature_group_count}`."
+                )
+            # Need to know the spatial output shape of a standard convolution to
+            # create the unshared convolution kernel.
+            conv_output_shape = jax.eval_shape(
+                lambda lhs, rhs: self.conv_general_dilated(  # pylint: disable=g-long-lambda
+                    lhs=lhs,
+                    rhs=rhs,
+                    window_strides=strides,
+                    padding=padding_lax,
+                    dimension_numbers=dimension_numbers,
+                    lhs_dilation=input_dilation,
+                    rhs_dilation=kernel_dilation,
+                ),
+                inputs,
+                jax.ShapedArray(kernel_size + (in_features, self.features), inputs.dtype),
+            ).shape
+            # One (unshared) convolutional kernel per each pixel in the output.
+            kernel_shape = conv_output_shape[1:-1] + (
+                np.prod(kernel_size) * in_features,
+                self.features,
+            )
+        if self.mask is not None and self.mask.shape != kernel_shape:
+            raise ValueError(
+                "Mask needs to have the same shape as weights. " f"Shapes are: {self.mask.shape}, {kernel_shape}"
+            )
+        kernel = param_with_axes(
+            "kernel",
+            self.kernel_init,
+            kernel_shape,
+            self.params_dtype,
+            axes=self.kernel_axes,
+        )
+        if self.mask is not None:
+            kernel *= self.mask
+        if self.use_bias:
+            if self.shared_weights:
+                # One bias weight per output channel, shared between pixels.
+                bias_shape = (self.features,)
+            else:
+                # One bias weight per output entry, unshared between pixels.
+                bias_shape = conv_output_shape[1:]
+            # NOTE(review): `self.kernel_axes[-1]` raises IndexError for the
+            # default `kernel_axes=()`, so callers that keep `use_bias=True`
+            # appear to be required to pass `kernel_axes` - confirm.
+            bias = param_with_axes(
+                "bias",
+                self.bias_init,
+                bias_shape,
+                self.params_dtype,
+                axes=(self.kernel_axes[-1],),
+            )
+        else:
+            bias = None
+        inputs, kernel, bias = promote_dtype(inputs, kernel, bias, dtype=self.dtype)
+        if self.shared_weights:
+            y = self.conv_general_dilated(
+                inputs,
+                kernel,
+                strides,
+                padding_lax,
+                lhs_dilation=input_dilation,
+                rhs_dilation=kernel_dilation,
+                dimension_numbers=dimension_numbers,
+                feature_group_count=self.feature_group_count,
+                precision=self.precision,
+            )
+        else:
+            y = lax.conv_general_dilated_local(
+                lhs=inputs,
+                rhs=kernel,
+                window_strides=strides,
+                padding=padding_lax,
+                filter_shape=kernel_size,
+                lhs_dilation=input_dilation,
+                rhs_dilation=kernel_dilation,
+                dimension_numbers=dimension_numbers,
+                precision=self.precision,
+            )
+        if self.use_bias:
+            # Prepend singleton axes so the bias broadcasts against `y`.
+            bias = bias.reshape((1,) * (y.ndim - bias.ndim) + bias.shape)
+            y += bias
+        if num_batch_dimensions != 1:
+            # Undo the batch flattening done at the top of the call.
+            output_shape = input_batch_shape + y.shape[1:]
+            y = jnp.reshape(y, output_shape)
+        return y
+class Conv(_Conv):
+    """Convolution Module wrapping `lax.conv_general_dilated`.
+    This is the shared-weight (regular) variant of `_Conv`: `shared_weights`
+    is fixed to `True`.
+    Attributes:
+      features: number of convolution filters.
+      kernel_size: shape of the convolutional kernel. Must be a sequence of
+        integers, also for 1D convolution; `_Conv.__call__` raises a
+        `TypeError` for a bare int.
+      strides: an integer or a sequence of `n` integers, representing the
+        inter-window strides (default: 1).
+      padding: either the string `'SAME'`, the string `'VALID'`, the string
+        `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
+        high)` integer pairs that give the padding to apply before and after each
+        spatial dimension. A single int is interpreted as applying the same padding
+        in all dims and passing a single int in a sequence causes the same padding
+        to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
+        left-pad the convolution axis, resulting in same-sized output.
+      input_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of `inputs`
+        (default: 1). Convolution with input dilation `d` is equivalent to
+        transposed convolution with stride `d`.
+      kernel_dilation: an integer or a sequence of `n` integers, giving the
+        dilation factor to apply in each spatial dimension of the convolution
+        kernel (default: 1). Convolution with kernel dilation
+        is also known as 'atrous convolution'.
+      feature_group_count: integer, default 1. If specified divides the input
+        features into groups.
+      use_bias: whether to add a bias to the output (default: True).
+      mask: Optional mask for the weights during masked convolution. The mask must
+            be the same shape as the convolution weight matrix.
+      dtype: the dtype of the computation (default: infer from input and params).
+      params_dtype: the dtype passed to parameter initializers (default: float32).
+      precision: numerical precision of the computation see `jax.lax.Precision`
+        for details.
+      kernel_init: initializer for the convolutional kernel.
+      bias_init: initializer for the bias.
+    """
+    @property
+    def shared_weights(self) -> bool:
+        # Regular convolution: one kernel shared by every output position.
+        return True

distil_whisper/modeling_flax_whisper.py ADDED Viewed

	@@ -0,0 +1,2135 @@

+# coding=utf-8
+# Copyright 2023 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Flax whisper model."""
+import random
+from functools import partial
+from typing import Dict, Optional, Tuple, Union
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.linen.partitioning import remat, scan_with_axes
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+from transformers import WhisperConfig
+from transformers.generation.flax_logits_process import (
+    FlaxLogitsProcessor,
+    FlaxLogitsProcessorList,
+    FlaxWhisperTimeStampLogitsProcessor,
+)
+from transformers.modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxSeq2SeqLMOutput,
+    FlaxSeq2SeqModelOutput,
+)
+from transformers.modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_call_sample_docstring,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .layers import Conv, DenseGeneral, Embed, LayerNorm, with_sharding_constraint
+logger = logging.get_logger(__name__)
+_CHECKPOINT_FOR_DOC = "openai/whisper-tiny"
+_CONFIG_FOR_DOC = "WhisperConfig"
+WHISPER_START_DOCSTRING = r"""
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.) This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        config ([`WhisperConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
+            inference on GPUs or TPUs. If specified all the computation will be performed with the given `dtype`.
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
+            and [`~FlaxPreTrainedModel.to_bf16`].
+"""
+WHISPER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
+            tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
+            is not used. By default the silence in the input log mel spectrogram is ignored.
+        decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
+            [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids) Whisper uses the `decoder_start_token_id` as
+            the starting token for `decoder_input_ids` generation.
+        decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
+            in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not use `position_ids` in the encoder as `input_features` is always the same size and doesn't
+            use masking, but this argument is preserved for compatibility. By default the silence in the input log mel
+            spectrogram is ignored.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
+            tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
+            is not used. By default the silence in the input log mel spectrogram is ignored.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+WHISPER_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`):
+            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
+            [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        encoder_outputs (`tuple(tuple(numpy.ndarray))`):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Whisper does not support masking of the `input_features`, this argument is preserved for compatibility,
+            but it is not used. By default the silence in the input log mel spectrogram is ignored.
+        decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
+            in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        past_key_values (`Dict[str, numpy.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+class FlaxStaticForceTokensLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
+    token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
+    to `-inf` so that they are sampled at their corresponding index. This is a static version of the `transformers` logit
+    processor [`FlaxForceTokensLogitsProcessor`] that is compatible with sharded forced tokens.
+    Args:
+        force_token_map (`list`):
+            Map giving token ids and indices where they will be forced to be sampled.
+    """
+    def __init__(self, force_token_map):
+        # The generic `transformers` logit processor builds `force_token_array` as a dictionary - this is not a valid
+        # JAX type, and so we switch to using a JAX array instead
+        force_token_map = jnp.array(force_token_map)
+        # Converts the array of format [[index, token]] containing the tokens to be forced to an array, where the
+        # index of the array corresponds to the index of the token to be forced. For XLA compatibility,
+        # indexes without forced tokens will have a negative value. Note that the last token we ever need to force in
+        # Whisper is at position 3, so we only construct an array up to this index. The native version constructs a tensor
+        # dynamically according to the length of the `force_token_map`. Array shapes need to be concrete for XLA compatibility,
+        # so this is not permitted here.
+        force_token_array = jnp.ones(3, dtype=jnp.int32) * -1
+        for index, token in force_token_map:
+            force_token_array = force_token_array.at[index].set(token)
+        self.force_token_array = jnp.int32(force_token_array)
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        def _force_token(generation_idx):
+            batch_size = scores.shape[0]
+            current_token = self.force_token_array[generation_idx]
+            new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
+            updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
+            new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
+            return new_scores
+        scores = lax.cond(
+            cur_len >= self.force_token_array.shape[0],
+            # If the current length is geq than the length of force_token_array, the processor does nothing.
+            lambda: scores,
+            # Otherwise, it may force a certain token.
+            lambda: lax.cond(
+                self.force_token_array[cur_len] >= 0,
+                # Only valid (positive) tokens are forced
+                lambda: _force_token(cur_len),
+                # Otherwise, the processor does nothing.
+                lambda: scores,
+            ),
+        )
+        return scores
+class FlaxWhisperAttention(nn.Module):
+    """Multi-head attention block usable as self-attention or cross-attention.
+    Keys and values are projected from `key_value_states` when it is given (cross-attention) and from
+    `hidden_states` otherwise. With `causal=True` a causal mask is applied and keys/values can be cached for
+    step-by-step autoregressive decoding.
+    """
+    config: WhisperConfig
+    embed_dim: int  # must be divisible by `num_heads` (checked in `setup`)
+    num_heads: int
+    dropout: float = 0.0  # passed as `dropout_rate` when computing the attention weights
+    causal: bool = False  # when True, apply a causal mask and allow the key/value cache path
+    bias: bool = True  # bias switch for the q/v/out projections (the k projection never has a bias)
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        """Create the projection layers and, for causal attention, the causal mask.
+        Raises:
+            ValueError: if `embed_dim` is not divisible by `num_heads`.
+        """
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                "embed_dim must be divisible by num_heads (got `embed_dim`:"
+                f" {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
+        # Shared constructor for the q/k/v projections; only `use_bias` differs between them.
+        dense = partial(
+            DenseGeneral,
+            self.embed_dim,
+            axis=-1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "joined_kv"),
+        )
+        self.q_proj = dense(use_bias=self.bias)
+        # The key projection is always created without a bias, regardless of `self.bias`.
+        self.k_proj = dense(use_bias=False)
+        self.v_proj = dense(use_bias=self.bias)
+        self.out_proj = DenseGeneral(
+            self.embed_dim,
+            axis=-1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("joined_kv", "embed"),
+            use_bias=self.bias,
+        )
+        if self.causal:
+            # Pre-compute the causal mask for the longest supported target length; it is sliced per call.
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_target_positions), dtype="bool"),
+                dtype="bool",
+            )
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        """Apply attention and return `(attn_output, attn_weights)`.
+        Args:
+            hidden_states: input from which the queries are projected (and keys/values for self-attention).
+            key_value_states: if given, keys and values are projected from it instead (cross-attention).
+            attention_mask: optional mask; combined with the causal mask when `causal=True`.
+            init_cache: if True (and `causal=True`), go through the cache path so the cache variables are created.
+            deterministic: if False and `dropout > 0`, a "dropout" RNG is drawn for the attention weights.
+        """
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+        query_states = self.q_proj(hidden_states)
+        if is_cross_attention:
+            key_states = self.k_proj(key_value_states)
+            value_states = self.v_proj(key_value_states)
+        else:
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        # Split the last dimension into (num_heads, head_dim).
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+        query_states = with_sharding_constraint(query_states, ("batch", "length", "heads", "kv"))
+        key_states = with_sharding_constraint(key_states, ("batch", "length", "heads", "kv"))
+        value_states = with_sharding_constraint(value_states, ("batch", "length", "heads", "kv"))
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                # With an existing cache, slice the mask rows starting at the current cache index and keep
+                # as many columns as the cache can hold.
+                mask_shift = self.variables["cache"]["cache_index"]
+                # max_length of cached_key is last dim
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[-1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask,
+                    (0, 0, mask_shift, 0),
+                    (1, 1, query_length, max_decoder_length),
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias: 0 where attending is allowed, the most negative
+            # finite value of `self.dtype` elsewhere
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        # A dropout RNG is only requested when dropout can actually be applied.
+        dropout_rng = None
+        if not deterministic and self.dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+        # Weighted sum of the values per head, then merge the heads and apply the output projection.
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights
+    def _split_heads(self, hidden_state) -> jnp.ndarray:
+        """Reshape the trailing dimension(s) into `(num_heads, head_dim)`, keeping the first two dimensions."""
+        return hidden_state.reshape(hidden_state.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_state) -> jnp.ndarray:
+        """Inverse of `_split_heads`: collapse the trailing dimensions back into a single `embed_dim` axis."""
+        return hidden_state.reshape(hidden_state.shape[:2] + (self.embed_dim,))
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """Write the new keys/values into the cache and return the cached keys/values with an updated mask.
+        On the call that creates the cache variables, the inputs are returned unchanged. On later calls the new
+        key/value slices are added into the cache, the cache index is advanced by the number of query positions,
+        and the mask is restricted to the cache positions filled so far.
+        Raises:
+            ValueError: if a single-position query does not match the shape implied by the cache.
+        """
+        # The following code is largely copied from: https://github.com/google-research/t5x/blob/63d9addf628c6d8c547a407a32095fcb527bb20b/t5x/examples/scalable_t5/layers.py#L280-L284
+        is_initialized = self.has_variable("cache", "cached_key")
+        # The key and value have dimension [batch_size, seq_length, num_heads, head_dim],
+        # but we cache them as [batch_size, num_heads, head_dim, seq_length] as a TPU
+        # fusion optimization. This also enables the "scatter via one-hot
+        # broadcast" trick, which means we do a one-hot broadcast instead of a
+        # scatter/gather operations, resulting in a 3-4x speedup in practice.
+        def swap_dims(x):
+            # Operates on a shape tuple: moves the third-from-last entry to the end.
+            return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            batch_size, num_heads, head_dim, seq_length = cached_key.value.shape
+            # During fast autoregressive decoding, we feed one position at a time,
+            # and cache the keys and values step by step.
+            # Sanity shape check of cached key against input query.
+            num_updated_cache_vectors = query.shape[1]
+            expected_shape = (batch_size, 1, num_heads, head_dim)
+            if num_updated_cache_vectors == 1 and expected_shape != query.shape:
+                raise ValueError(
+                    "Autoregressive cache shape error, expected query shape"
+                    f" {expected_shape} instead got {query.shape}"
+                )
+            # Create a OHE of the current index. NOTE: the index is increased below.
+            cur_index = cache_index.value
+            # In order to update the key, value caches with the current key and
+            # value, we move the seq_length axis to the back, similar to what we did for
+            # the cached ones above.
+            # Note these are currently the key and value of a single position, since
+            # we feed one position at a time.
+            one_token_key = jnp.moveaxis(key, -3, -1)
+            one_token_value = jnp.moveaxis(value, -3, -1)
+            # Update key, value caches with our new 1d spatial slices.
+            # We implement an efficient scatter into the cache via one-hot
+            # broadcast and addition.
+            if num_updated_cache_vectors > 1:
+                # Several positions at once: the rectangular identity matrix places position i of the input
+                # into cache slot i (the current cache index is not used on this path).
+                indices = jnp.eye(num_updated_cache_vectors, seq_length)[None, None]
+                key = cached_key.value + jnp.matmul(one_token_key, indices)
+                value = cached_value.value + jnp.matmul(one_token_value, indices)
+            else:
+                one_hot_indices = jax.nn.one_hot(cur_index, seq_length, dtype=key.dtype)
+                key = cached_key.value + one_token_key * one_hot_indices
+                value = cached_value.value + one_token_value * one_hot_indices
+            cached_key.value = key
+            cached_value.value = value
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # Move the keys and values back to their original shapes.
+            key = jnp.moveaxis(key, -1, -3)
+            value = jnp.moveaxis(value, -1, -3)
+            # causal mask for cached decoder self-attention: our single query position should only
+            # attend to those key positions that have already been generated and cached, not the
+            # remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(seq_length) < cur_index + num_updated_cache_vectors,
+                (batch_size,) + (1, num_updated_cache_vectors, seq_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+class FlaxWhisperEncoderLayer(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.encoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.fc1 = DenseGeneral(
+            self.config.encoder_ffn_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "mlp"),
+        )
+        self.fc2 = DenseGeneral(
+            self.embed_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("mlp", "embed"),
+        )
+        self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+        all_hidden_states=None,  # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
+    ) -> Tuple[jnp.ndarray]:
+        if self.use_scan:
+            hidden_states = hidden_states[0]
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        residual = hidden_states
+        layernorm_output = self.self_attn_layer_norm(hidden_states)
+        layernorm_output = with_sharding_constraint(layernorm_output, ("batch", "length", "embed"))
+        attn_output, attn_weights = self.self_attn(hidden_states=layernorm_output, attention_mask=attention_mask)
+        attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
+        attn_output = residual + attn_output
+        attn_output = with_sharding_constraint(attn_output, ("batch", "length", "embed"))
+        residual = attn_output
+        post_layer_norm = self.final_layer_norm(attn_output)
+        post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
+        fc1_output = self.activation_fn(self.fc1(post_layer_norm))
+        fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
+        fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
+        hidden_states = self.fc2(fc1_output)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        if self.use_scan:
+            if all_hidden_states is not None:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            outputs = (
+                outputs,
+                all_hidden_states,
+            )
+        return outputs
+class FlaxWhisperEncoderLayerCollection(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    @nn.compact
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        FlaxWhisperEncoderCheckpointLayer = (
+            remat(
+                FlaxWhisperEncoderLayer,
+                static_argnums=(2, 3),
+                prevent_cse=not self.use_scan,
+            )
+            if self.gradient_checkpointing
+            else FlaxWhisperEncoderLayer
+        )
+        if self.use_scan:
+            if output_attentions:
+                raise ValueError("Cannot use `scan` with `output_attentions` set to True")
+            # nicest behaviour for scan is to let the compiler figure out the correct shapes for the hidden states
+            # so we'll just pass an empty tuple as the carry initializer and hold on to the first hidden states for later
+            input_hidden_states = hidden_states
+            hidden_states = (hidden_states,)
+            hidden_states, all_hidden_states = scan_with_axes(
+                FlaxWhisperEncoderCheckpointLayer,
+                variable_axes={"params": 0, "cache": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                ),
+                variable_carry="all_hidden_states",
+                length=self.config.encoder_layers,
+            )(
+                self.config,
+                dtype=self.dtype,
+                params_dtype=self.params_dtype,
+                use_scan=True,
+                name="FlaxEncoderScanLayers",
+            )(
+                hidden_states,
+                attention_mask,
+                output_attentions,
+                deterministic,
+                all_hidden_states,  # tuple intializer (or None if not using output_hidden_states)
+            )
+            # remove the scan dimension
+            hidden_states = hidden_states[0]
+            if output_hidden_states:
+                # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
+                all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
+        else:
+            for layer_idx in range(self.config.encoder_layers):
+                if output_hidden_states:
+                    all_hidden_states = all_hidden_states + (hidden_states,)
+                # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+                dropout_probability = random.uniform(0, 1)
+                if not deterministic and (dropout_probability < self.config.encoder_layerdrop):  # skip the layer
+                    layer_outputs = (None, None)
+                else:
+                    layer_outputs = FlaxWhisperEncoderCheckpointLayer(
+                        self.config,
+                        dtype=self.dtype,
+                        params_dtype=self.params_dtype,
+                        name=str(layer_idx),
+                    )(
+                        hidden_states,
+                        attention_mask,
+                        output_attentions,
+                        deterministic,
+                    )
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+class FlaxWhisperDecoderLayer(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            causal=True,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+        self.encoder_attn = FlaxWhisperAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.encoder_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+        self.fc1 = DenseGeneral(
+            self.config.decoder_ffn_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("embed", "mlp"),
+        )
+        self.fc2 = DenseGeneral(
+            self.embed_dim,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("mlp", "embed"),
+        )
+        self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+        all_hidden_states=None,  # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
+    ) -> Tuple[jnp.ndarray]:
+        if self.use_scan:
+            hidden_states = hidden_states[0]
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        residual = hidden_states
+        layer_norm_output = self.self_attn_layer_norm(hidden_states)
+        layer_norm_output = with_sharding_constraint(layer_norm_output, ("batch", "length", "embed"))
+        # Self Attention
+        self_attn_output, self_attn_weights = self.self_attn(
+            hidden_states=layer_norm_output,
+            attention_mask=attention_mask,
+            init_cache=init_cache,
+        )
+        self_attn_output = self.dropout_layer(self_attn_output, deterministic=deterministic)
+        self_attn_output = residual + self_attn_output
+        self_attn_output = with_sharding_constraint(self_attn_output, ("batch", "length", "embed"))
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = self_attn_output
+            encoder_layer_norm_output = self.encoder_attn_layer_norm(self_attn_output)
+            encoder_layer_norm_output = with_sharding_constraint(
+                encoder_layer_norm_output, ("batch", "length", "embed")
+            )
+            cross_attn_output, cross_attn_weights = self.encoder_attn(
+                hidden_states=encoder_layer_norm_output,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            cross_attn_output = self.dropout_layer(cross_attn_output, deterministic=deterministic)
+            cross_attn_output = residual + cross_attn_output
+            cross_attn_output = with_sharding_constraint(cross_attn_output, ("batch", "length", "embed"))
+        # Fully Connected
+        residual = cross_attn_output
+        post_layer_norm = self.final_layer_norm(cross_attn_output)
+        post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
+        fc1_output = self.activation_fn(self.fc1(post_layer_norm))
+        fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
+        fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
+        hidden_states = self.fc2(fc1_output)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+        if self.use_scan:
+            if all_hidden_states is not None:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            outputs = (
+                outputs,
+                all_hidden_states,
+            )
+        return outputs
+class FlaxWhisperDecoderLayerCollection(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    @nn.compact
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        FlaxWhisperDecoderCheckpointLayer = (
+            remat(
+                FlaxWhisperDecoderLayer,
+                static_argnums=(4, 5, 6),
+                prevent_cse=not self.use_scan,
+            )
+            if self.gradient_checkpointing
+            else FlaxWhisperDecoderLayer
+        )
+        if self.use_scan:
+            if output_attentions:
+                raise ValueError("Cannot use `scan` with `output_attentions` set to True")
+            input_hidden_states = hidden_states
+            hidden_states = (hidden_states,)
+            hidden_states, all_hidden_states = scan_with_axes(
+                FlaxWhisperDecoderCheckpointLayer,
+                variable_axes={"params": 0, "cache": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                ),
+                variable_carry="all_hidden_states",
+                length=self.config.decoder_layers,
+            )(
+                self.config,
+                dtype=self.dtype,
+                params_dtype=self.params_dtype,
+                use_scan=True,
+                name="FlaxDecoderScanLayers",
+            )(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                output_attentions,
+                deterministic,
+                all_hidden_states,
+            )
+            hidden_states = hidden_states[0]
+            if output_hidden_states:
+                # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
+                all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
+        else:
+            for layer_idx in range(self.config.decoder_layers):
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+                    # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+                dropout_probability = random.uniform(0, 1)
+                if not deterministic and (dropout_probability < self.config.decoder_layerdrop):
+                    layer_outputs = (None, None, None)
+                else:
+                    layer_outputs = FlaxWhisperDecoderCheckpointLayer(
+                        self.config,
+                        dtype=self.dtype,
+                        params_dtype=self.params_dtype,
+                        name=str(layer_idx),
+                    )(
+                        hidden_states,
+                        attention_mask,
+                        encoder_hidden_states,
+                        encoder_attention_mask,
+                        init_cache,
+                        output_attentions,
+                        deterministic,
+                    )
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)
+                    if encoder_hidden_states is not None:
+                        all_cross_attentions += (layer_outputs[2],)
+            # add hidden states from the last decoder layer
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+        outputs = [
+            hidden_states,
+            all_hidden_states,
+            all_self_attns,
+            all_cross_attentions,
+        ]
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+class FlaxWhisperEncoder(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    def setup(self) -> None:
+        self.conv1 = Conv(
+            self.config.d_model,
+            kernel_size=(3,),
+            padding=1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("channels", "num_mel", "embed"),
+        )
+        self.conv2 = Conv(
+            self.config.d_model,
+            kernel_size=(3,),
+            strides=2,
+            padding=1,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            kernel_axes=("channels", "embed", "num_mel"),
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.layers = FlaxWhisperEncoderLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.embed_positions = Embed(
+            self.config.max_source_positions,
+            self.config.d_model,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        if input_features.shape[1:] != (
+            self.config.num_mel_bins,
+            self.config.max_source_positions * 2,
+        ):
+            raise ValueError(
+                "input_features.shape[1:], must be equal to (self.config.num_mel_bins,"
+                " self.config.max_source_positions * 2) (got"
+                f" {input_features.shape[1:]}, but should be"
+                f" ({self.config.num_mel_bins},"
+                f" {self.config.max_source_positions * 2}))"
+            )
+        input_features = input_features.transpose(0, 2, 1)
+        hidden_states = jax.nn.gelu(self.conv1(input_features), approximate=False)
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "embed", "num_mel"))
+        hidden_states = jax.nn.gelu(self.conv2(hidden_states), approximate=False)
+        hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
+        embed_positions = self.embed_positions(jnp.arange(self.config.max_source_positions))
+        # sinusoidal positional embeddings should not be trained
+        embed_positions = jax.lax.stop_gradient(embed_positions)
+        hidden_states = hidden_states + embed_positions
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask=None,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_states = outputs[0]
+        last_hidden_states = self.layer_norm(last_hidden_states)
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            if self.use_scan:
+                hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
+            else:
+                hidden_states = hidden_states[:-1] + (last_hidden_states,)
+        if not return_dict:
+            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=last_hidden_states,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+        )
+class FlaxWhisperDecoder(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    def setup(self) -> None:
+        self.embed_tokens = Embed(
+            self.config.vocab_size,
+            self.config.d_model,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.embed_positions = Embed(
+            self.config.max_target_positions,
+            self.config.d_model,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+        )
+        self.layers = FlaxWhisperDecoderLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-5, params_dtype=self.params_dtype)
+    def __call__(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        position_ids: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        input_embeds = self.embed_tokens(input_ids)
+        position_embeds = self.embed_positions(position_ids)
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_states = outputs[0]
+        last_hidden_states = self.layer_norm(last_hidden_states)
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            if self.use_scan:
+                hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
+            else:
+                hidden_states = hidden_states[:-1] + (last_hidden_states,)
+        if not return_dict:
+            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=last_hidden_states,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+class FlaxWhisperModule(nn.Module):
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    def setup(self) -> None:
+        self.encoder = FlaxWhisperEncoder(
+            self.config,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.decoder = FlaxWhisperDecoder(
+            self.config,
+            dtype=self.dtype,
+            params_dtype=self.params_dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        decoder_input_ids: jnp.ndarray,
+        decoder_attention_mask: jnp.ndarray,
+        decoder_position_ids: jnp.ndarray,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        freeze_encoder: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        encoder_outputs = self.encoder(
+            input_features,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        encoder_hidden_states = encoder_outputs[0]
+        if freeze_encoder:
+            encoder_hidden_states = jax.lax.stop_gradient(encoder_hidden_states)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+        return FlaxSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+    def _get_encoder_module(self):
+        return self.encoder
+    def _get_decoder_module(self):
+        return self.decoder
+class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
+    # Base class for the Whisper model wrappers in this file: it builds the Flax module from
+    # `module_class` and adds helpers to switch between scanned and unrolled layer parameters.
+    # Configuration class paired with this model family.
+    config_class = WhisperConfig
+    # Prefix string for the base model (consumed by the parent class -- semantics defined there).
+    base_model_prefix: str = "model"
+    # Name of the primary model input.
+    main_input_name = "input_features"
+    # Flax module class instantiated in `__init__`; `None` here, so concrete subclasses presumably set it.
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: WhisperConfig,
+        input_shape: Tuple[int, int, int] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        params_dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        # Can only use_scan=True in init if loading scanned weights -> need to handle use_scan=True and unrolled weights
+        use_scan: bool = False,
+        gradient_checkpointing: bool = False,
+        **kwargs,
+    ):
+        self.use_scan = use_scan
+        self.gradient_checkpointing = gradient_checkpointing
+        module = self.module_class(
+            config=config,
+            dtype=dtype,
+            params_dtype=params_dtype,
+            use_scan=use_scan,
+            gradient_checkpointing=gradient_checkpointing,
+            **kwargs,
+        )
+        if input_shape is None:
+            input_shape = (1, config.num_mel_bins, 2 * config.max_source_positions)
+        super().__init__(
+            config,
+            module,
+            input_shape=input_shape,
+            seed=seed,
+            dtype=dtype,
+            _do_init=_do_init,
+        )
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_features = jnp.zeros(input_shape, dtype="f4")
+        input_features = input_features.at[(..., -1)].set(self.config.eos_token_id)
+        decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        batch_size, sequence_length = decoder_input_ids.shape
+        decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        random_params = self.module.init(
+            rngs,
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+        )["params"]
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+    def enable_scan(self):
+        self.use_scan = True
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        init_fn = partial(self.init_weights, input_shape=self.input_shape)
+        params_shape_tree = jax.eval_shape(init_fn, self.key)
+        # get the shape of the parameters
+        self._params_shape_tree = params_shape_tree
+        # save required_params as set
+        self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
+        # initialize the parameters
+        if self._is_initialized:
+            self.params = self.convert_unroll_to_scan(self.params)
+    def disable_scan(self):
+        self.use_scan = False
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        init_fn = partial(self.init_weights, input_shape=self.input_shape)
+        params_shape_tree = jax.eval_shape(init_fn, self.key)
+        # get the shape of the parameters
+        self._params_shape_tree = params_shape_tree
+        # save required_params as set
+        self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
+        # initialize the parameters
+        if self._is_initialized:
+            self.params = self.convert_scan_to_unroll(self.params)
+    def convert_unroll_to_scan(self, params: Union[Dict, FrozenDict]):
+        r"""
+        Convert a `PyTree` of unrolled model parameters to a scanned block of model parameters. This method can be used
+        to explicitly convert the model parameters to scanned format. This returns a new `params` tree and does not
+        convert the `params` in place.
+        To illustrate the workings of this method, take the Flax BERT model. The unrolled structure for the query
+        projection params is as follows:
+            ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
+            'q_proj') ... ('bert', 'encoder', 'layer', '23', 'self_attn', 'q_proj')
+        This method takes each of the `q_proj` matrices for layers (0, ..., 23) and stacks them into a single 'super'
+        matrix, giving a *single* block of weights for all 24 layers compatible with the scanned model:
+            ('bert', 'encoder', 'layer', 'ScanLayers', 'self_attn', 'q_proj')
+        When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
+        _do_init=False, it will have to be called explicitly (see example below).
+        Arguments:
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+        Examples:
+        ```python
+        >>> from distil_whisper import FlaxWhisperForConditionalGeneration
+        >>> # Download model and configuration from huggingface.co
+        >>> model, params = FlaxWhisperModel.from_pretrained("openai/whisper-tiny.en", _do_init=False)
+        >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
+        >>> # we'll first convert to scan format and then back to unrolled
+        >>> model.enable_scan()
+        >>> params = model.convert_unroll_to_scan(params)
+        >>> # now convert back to unrolled
+        >>> model.disable_scan()
+        >>> params = model.convert_scan_to_unroll(params)
+        ```"""
+        if isinstance(params, FrozenDict):
+            params = unfreeze(params)
+        params = flatten_dict(params, sep="/")
+        keys = list(params.keys())
+        for k in keys:
+            # Identify all "unrolled" layers formed as part of the FlaxBertLayerCollection
+            # These params contain the identifier `layer` in their key
+            if "layers/0" in k:
+                if "decoder" in k:
+                    block_prefix = "Decoder"
+                    num_hidden_layers = self.config.decoder_layers
+                else:
+                    block_prefix = "Encoder"
+                    num_hidden_layers = self.config.encoder_layers
+                # Squash the keys for the N unrolled layers into one single key:
+                # (layer/0, ..., layer/N) -> layer/FlaxScanLayers
+                scan_key = k.replace("0", f"Flax{block_prefix}ScanLayers")
+                stacked_params = []
+                # Iterate over the unrolled layers (1,...,N)
+                for i in range(num_hidden_layers):
+                    # Stack the params for the N layers into one super block
+                    # and remove the unrolled layer params on the fly
+                    # -> no memory overhead for conversion!
+                    unrolled_layer = params.pop(k.replace("0", str(i)))
+                    stacked_params.append(unrolled_layer)
+                params[scan_key] = jnp.stack(stacked_params)
+        # Finally, unflatten the dict to restore the nested pytree structure
+        params = unflatten_dict(params, sep="/")
+        return params
+    def convert_scan_to_unroll(self, params: Union[Dict, FrozenDict]):
+        r"""
+        Convert a `PyTree` of scanned model parameters to an unrolled stack of model parameters. This method can be
+        used to explicitly convert the model parameters to unrolled format. This returns a new `params` tree and does
+        not convert the `params` in place.
+        To illustrate the workings of this method, take the Flax BERT model. The scanned structure for the query
+        projection (`q_proj`) params is a single, stacked matrix of parameters over all N layers:
+            ('bert', 'encoder', 'layer', 'FlaxScanLayers', 'self_attn', 'q_proj')
+        This method slices each layer of the `q_proj` scanned matrix into single, standalone layers, and replaces the
+        scanned matrix of parameteres on the fly:
+            ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
+            'q_proj') ... ('bert', 'encoder', 'layer', 'N', 'self_attn', 'q_proj')
+        When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
+        _do_init=False, it will have to be called explicitly (see example below).
+        Arguments:
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+        Examples:
+        ```python
+        >>> from distil_whisper import FlaxWhisperForConditionalGeneration
+        >>> # Download model and configuration from huggingface.co
+        >>> model, params = FlaxWhisperModel.from_pretrained("openai/whisper-tiny.en", _do_init=False)
+        >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
+        >>> # we'll first convert to scan format and then back to unrolled
+        >>> model.enable_scan()
+        >>> params = model.convert_unroll_to_scan(params)
+        >>> # now convert back to unrolled
+        >>> model.disable_scan()
+        >>> params = model.convert_scan_to_unroll(params)
+        ```"""
+        if isinstance(params, FrozenDict):
+            params = unfreeze(params)
+        params = flatten_dict(params, sep="/")
+        keys = list(params.keys())
+        for k in keys:
+            # Identify all "scan" layers formed as part of the FlaxBertLayerCollection
+            # These params contain the identifier `FlaxScanLayers` in their key
+            if "FlaxEncoderScanLayers" in k:
+                # Remove the scan layer from the PyTree of params
+                scan_layer = params.pop(k)
+                # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
+                # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
+                for i in range(self.config.encoder_layers):
+                    # Unstack the params for the i-th scan layer to unrolled
+                    # and remove corresponding scan params on the fly
+                    # -> no memory overhead for conversion!
+                    unrolled_key = k.replace("FlaxEncoderScanLayers", str(i))
+                    params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
+            elif "FlaxDecoderScanLayers" in k:
+                # Remove the scan layer from the PyTree of params
+                scan_layer = params.pop(k)
+                # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
+                # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
+                for i in range(self.config.decoder_layers):
+                    # Unstack the params for the i-th scan layer to unrolled
+                    # and remove corresponding scan params on the fly
+                    # -> no memory overhead for conversion!
+                    unrolled_key = k.replace("FlaxDecoderScanLayers", str(i))
+                    params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
+        params = unflatten_dict(params, sep="/")
+        return params
+    # Copied from transformers.models.whisper.modeling_flax_whisper.FlaxWhisperPreTrainedModel.init_cache
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Create an empty decoder cache by running a dummy initialisation pass through the decoder only.
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                Only element `0` is read here; it is forwarded to the decoder as `encoder_hidden_states`.
+        Returns:
+            The `"cache"` collection of the initialised variables, passed through `unfreeze`.
+        """
+        # The keyword names below must match the keywords given to `self.module.init`.
+        def _decoder_forward(
+            module,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+            **kwargs,
+        ):
+            # we only need to call the decoder to init the cache
+            return module._get_decoder_module()(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+        # Dummy all-ones inputs spanning the full cache length; only their shapes matter for the call below.
+        cache_shape = (batch_size, max_length)
+        dummy_ids = jnp.ones(cache_shape, dtype="i4")
+        dummy_mask = jnp.ones_like(dummy_ids)
+        dummy_positions = jnp.broadcast_to(jnp.arange(max_length), dummy_ids.shape)
+        variables = self.module.init(
+            jax.random.PRNGKey(0),
+            decoder_input_ids=dummy_ids,
+            decoder_attention_mask=dummy_mask,
+            decoder_position_ids=dummy_positions,
+            encoder_hidden_states=encoder_outputs[0],
+            init_cache=True,
+            method=_decoder_forward,
+        )
+        return unfreeze(variables["cache"])
+    @add_start_docstrings(WHISPER_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=WhisperConfig)
+    def encode(
+        self,
+        input_features: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+        **kwargs,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        ```"""
+        # NOTE(review): the docstring above is left untouched; the bare `Returns:` line is presumably the
+        # placeholder consumed by `replace_return_docstrings` - confirm before editing it.
+        # `attention_mask` and `**kwargs` are accepted but never referenced in this method.
+        # Explicit arguments win; anything left as None falls back to the model config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.return_dict
+        # Runs the encoder sub-module only, instead of the module's default `__call__`.
+        def _encoder_forward(module, input_features, **kwargs):
+            return module._get_encoder_module()(input_features, **kwargs)
+        # A dropout PRNG stream is only registered when the caller supplied a key.
+        rngs = {} if dropout_rng is None else {"dropout": dropout_rng}
+        variables = {"params": params or self.params}
+        return self.module.apply(
+            variables,
+            input_features=jnp.array(input_features, dtype="f4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+    @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=FlaxBaseModelOutputWithPastAndCrossAttentions,
+        config_class=WhisperConfig,
+    )
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> import jax.numpy as jnp
+        >>> from transformers import WhisperProcessor, FlaxWhisperModel
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperModel.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((input_features.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
+        # NOTE(review): keep the bare `Returns:` line in the docstring; it is presumably the placeholder
+        # consumed by `replace_return_docstrings` - confirm before editing it.
+        # `encoder_attention_mask` is accepted but never referenced in this method.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # Only the first element of the encoder outputs is used by the decoder.
+        encoder_hidden_states = encoder_outputs[0]
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_position_ids is None:
+            # With a cache the positions cannot be derived from the current inputs alone.
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+            if decoder_attention_mask is not None:
+                # Running count of attended tokens, zero-based; masked slots end up at -1.
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+            else:
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxWhisperAttention module
+        # NOTE(review): this branch tests truthiness while the unpacking below tests `is not None`, so an
+        # empty-dict cache would take inconsistent paths; behaviour intentionally left unchanged.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        # The parameter names must match the keywords passed to `self.module.apply` below.
+        def _decoder_forward(
+            module,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+            **kwargs,
+        ):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        if past_key_values is None:
+            return outputs
+        # add updated cache to model output
+        outputs, past = outputs
+        updated_cache = unfreeze(past["cache"])
+        if return_dict:
+            outputs["past_key_values"] = updated_cache
+            return outputs
+        # Tuple output: the cache is spliced in right after the first element.
+        return outputs[:1] + (updated_cache,) + outputs[1:]
+    @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_features: jnp.ndarray,
+        decoder_input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        freeze_encoder: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        # NOTE(review): deliberately no docstring here; the decorator above presumably assembles `__doc__`
+        # from `WHISPER_INPUTS_DOCSTRING` - confirm before adding one.
+        # `attention_mask` and `position_ids` are accepted but never referenced in this method.
+        # Explicit arguments win; anything left as None falls back to the model config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.return_dict
+        # prepare decoder inputs
+        if decoder_position_ids is None:
+            if decoder_attention_mask is None:
+                # No mask: positions are simply 0..sequence_length-1 for every row.
+                batch_size, sequence_length = decoder_input_ids.shape
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+            else:
+                # Running count of attended tokens, zero-based; masked slots end up at -1.
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        variables = {"params": params or self.params}
+        return self.module.apply(
+            variables,
+            input_features=jnp.array(input_features, dtype="f4"),
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            freeze_encoder=freeze_encoder,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+        )
+@add_start_docstrings(
+    ("The bare Whisper Model transformer outputting raw hidden-states without any specific head on top."),
+    WHISPER_START_DOCSTRING,
+)
+class FlaxWhisperModel(FlaxWhisperPreTrainedModel):
+    # Declarative subclass: it only binds `FlaxWhisperModule` as the module class and declares defaults;
+    # all methods come from `FlaxWhisperPreTrainedModel`.
+    # NOTE(review): no class docstring is added on purpose; the decorator above presumably builds
+    # `__doc__` from the strings it receives - confirm before adding one.
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    params_dtype: jnp.dtype = jnp.float32
+    module_class = FlaxWhisperModule
+# Registers call-sample documentation for the class (checkpoint, output type and config names are passed in).
+append_call_sample_docstring(FlaxWhisperModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
+class FlaxWhisperForConditionalGenerationModule(nn.Module):
+    """Flax module pairing `FlaxWhisperModule` with a bias-free projection onto the vocabulary."""
+    config: WhisperConfig
+    dtype: jnp.dtype = jnp.float32
+    params_dtype: jnp.dtype = jnp.float32
+    use_scan: bool = False
+    gradient_checkpointing: bool = False
+    def setup(self) -> None:
+        # Both sub-modules share the same computation / parameter dtypes.
+        dtype_kwargs = {"dtype": self.dtype, "params_dtype": self.params_dtype}
+        self.model = FlaxWhisperModule(
+            config=self.config,
+            use_scan=self.use_scan,
+            gradient_checkpointing=self.gradient_checkpointing,
+            **dtype_kwargs,
+        )
+        self.lm_head = DenseGeneral(
+            self.config.vocab_size,
+            use_bias=False,
+            kernel_axes=("embed", "vocab"),
+            **dtype_kwargs,
+        )
+    def _get_encoder_module(self):
+        """Return the encoder of the wrapped model."""
+        return self.model.encoder
+    def _get_decoder_module(self):
+        """Return the decoder of the wrapped model."""
+        return self.model.decoder
+    def __call__(
+        self,
+        input_features,
+        decoder_input_ids,
+        decoder_attention_mask: jnp.ndarray = None,
+        decoder_position_ids: jnp.ndarray = None,
+        position_ids: jnp.ndarray = None,
+        attention_mask: jnp.ndarray = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        freeze_encoder: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        """Run the wrapped model and project its first output to vocabulary logits.
+        `position_ids` and `attention_mask` are accepted but not forwarded to the wrapped model.
+        Returns a `FlaxSeq2SeqLMOutput` when `return_dict` is true, otherwise a tuple whose first element is
+        the logits followed by the wrapped model's remaining outputs.
+        """
+        model_outputs = self.model(
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            freeze_encoder=freeze_encoder,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        decoder_states = model_outputs[0]
+        if not self.config.tie_word_embeddings:
+            lm_logits = self.lm_head(decoder_states)
+        else:
+            # Tied weights: reuse the transposed decoder token embedding as the projection kernel.
+            embedding = self.model.decoder.embed_tokens.variables["params"]["embedding"]
+            lm_logits = self.lm_head.apply({"params": {"kernel": embedding.T}}, decoder_states)
+        if return_dict:
+            return FlaxSeq2SeqLMOutput(
+                logits=lm_logits,
+                decoder_hidden_states=model_outputs.decoder_hidden_states,
+                decoder_attentions=model_outputs.decoder_attentions,
+                cross_attentions=model_outputs.cross_attentions,
+                encoder_last_hidden_state=model_outputs.encoder_last_hidden_state,
+                encoder_hidden_states=model_outputs.encoder_hidden_states,
+                encoder_attentions=model_outputs.encoder_attentions,
+            )
+        # Tuple form: logits replace the first element of the wrapped model's outputs.
+        return (lm_logits,) + model_outputs[1:]
+@add_start_docstrings("The Whisper Model with a language modeling head.", WHISPER_START_DOCSTRING)
+class FlaxWhisperForConditionalGeneration(FlaxWhisperPreTrainedModel):
+    module_class = FlaxWhisperForConditionalGenerationModule
+    @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=WhisperConfig)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> import jax.numpy as jnp
+        >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+        >>> input_features = inputs.input_features
+        >>> encoder_outputs = model.encode(input_features=input_features)
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((input_features.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
+        # NOTE(review): keep the bare `Returns:` line in the docstring; it is presumably the placeholder
+        # consumed by `replace_return_docstrings` - confirm before editing it.
+        # `encoder_attention_mask` is accepted but never referenced in this method.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # Only the first element of the encoder outputs is used by the decoder.
+        encoder_hidden_states = encoder_outputs[0]
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_position_ids is None:
+            # With a cache the positions cannot be derived from the current inputs alone.
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+            if decoder_attention_mask is not None:
+                # Running count of attended tokens, zero-based; masked slots end up at -1.
+                decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
+            else:
+                decoder_position_ids = jnp.broadcast_to(
+                    jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+                )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length), dtype="i4")
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxWhisperAttention module
+        # NOTE(review): this branch tests truthiness while the unpacking below tests `is None`, so an
+        # empty-dict cache would take inconsistent paths; behaviour intentionally left unchanged.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        # The parameter names must match the keywords passed to `self.module.apply` below.
+        def _decoder_forward(
+            module,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+            **kwargs,
+        ):
+            decoder_module = module._get_decoder_module()
+            outputs = decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+            hidden_states = outputs[0]
+            if self.config.tie_word_embeddings:
+                # Tied weights: reuse the transposed decoder token embedding as the projection kernel.
+                shared_embedding = module.model.decoder.embed_tokens.variables["params"]["embedding"]
+                lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+            else:
+                lm_logits = module.lm_head(hidden_states)
+            return lm_logits, outputs
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        # With a cache the apply call additionally yields the mutated variable collections.
+        if past_key_values is None:
+            lm_logits, decoder_outputs = outputs
+        else:
+            (lm_logits, decoder_outputs), past = outputs
+        if return_dict:
+            outputs = FlaxCausalLMOutputWithCrossAttentions(
+                logits=lm_logits,
+                hidden_states=decoder_outputs.hidden_states,
+                attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+            )
+        else:
+            outputs = (lm_logits,) + decoder_outputs[1:]
+        if past_key_values is None:
+            return outputs
+        # add updated cache to model output
+        updated_cache = unfreeze(past["cache"])
+        if return_dict:
+            outputs["past_key_values"] = updated_cache
+            return outputs
+        # Tuple output: the cache is spliced in right after the logits.
+        return outputs[:1] + (updated_cache,) + outputs[1:]
+    def generate(
+        self,
+        input_features,
+        generation_config=None,
+        logits_processor=None,
+        return_timestamps=None,
+        task=None,
+        language=None,
+        is_multilingual=None,
+        **kwargs,
+    ):
+        """Configure Whisper-specific forced tokens / timestamp handling, then delegate to the parent `generate`.
+        Args:
+            input_features: forwarded unchanged to the parent `generate`.
+            generation_config: config to use; defaults to `self.generation_config`. The object is modified in
+                place by the overrides below (including the default one held by the model).
+            logits_processor: forwarded to the parent `generate`, unless timestamps are requested, in which
+                case it is replaced by a single `FlaxWhisperTimeStampLogitsProcessor`.
+            return_timestamps, task, language, is_multilingual: when not None, written onto the config.
+            **kwargs: forwarded to the parent `generate`; `decoder_input_ids`, if present, determines the
+                prompt length handed to the timestamp processor.
+        Returns:
+            Whatever the parent class' `generate` returns.
+        """
+        if generation_config is None:
+            generation_config = self.generation_config
+        # Per-call overrides are stored on the config object itself.
+        if return_timestamps is not None:
+            generation_config.return_timestamps = return_timestamps
+        if task is not None:
+            generation_config.task = task
+        if is_multilingual is not None:
+            generation_config.is_multilingual = is_multilingual
+        if language is not None:
+            generation_config.language = language
+        if "decoder_input_ids" in kwargs:
+            # Prompt length is the size of the last (sequence) axis. `len()` would return the batch size for
+            # a `(batch, seq)` array; for a 1-D prompt both give the same value.
+            decoder_input_length = jnp.shape(kwargs["decoder_input_ids"])[-1]
+        else:
+            decoder_input_length = 1
+        forced_decoder_ids = []
+        if getattr(generation_config, "is_multilingual", False):
+            # Position 1 carries the language token, position 2 the task token.
+            if hasattr(generation_config, "language"):
+                forced_decoder_ids.append((1, generation_config.lang_to_id[generation_config.language]))
+            else:
+                forced_decoder_ids.append((1, None))
+            # The task defaults to "transcribe" when the config does not define one.
+            task_name = getattr(generation_config, "task", "transcribe")
+            forced_decoder_ids.append((2, generation_config.task_to_id[task_name]))
+        # A non-None `return_timestamps` argument was already written onto the config above, so checking the
+        # config alone covers the argument as well.
+        if getattr(generation_config, "return_timestamps", False):
+            logits_processor = [
+                FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, decoder_input_length)
+            ]
+        elif forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
+            # NOTE(review): the condition compares a forced *position* with a token id; kept as in the
+            # original - confirm whether the token element was intended.
+            next_position = forced_decoder_ids[-1][0] + 1
+            forced_decoder_ids.append((next_position, generation_config.no_timestamps_token_id))
+        if forced_decoder_ids:
+            generation_config.forced_decoder_ids = forced_decoder_ids
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+    def pipeline_generate(
+        self,
+        input_features,
+        forced_decoder_ids,
+        return_timestamps=False,
+        generation_config=None,
+        **kwargs,
+    ):
+        """Generate with the caller's `forced_decoder_ids` enforced through a static logits processor.
+        The config's own `forced_decoder_ids` is reset to None in place (this also affects
+        `self.generation_config` when no config is passed). Returns whatever the parent `generate` returns.
+        """
+        active_config = self.generation_config if generation_config is None else generation_config
+        # override the generation config forced decoder ids in preference of the ones we have set
+        active_config.forced_decoder_ids = None
+        processors = FlaxLogitsProcessorList()
+        processors.append(FlaxStaticForceTokensLogitsProcessor(forced_decoder_ids))
+        # Timestamp rules are only added when requested and the config defines `return_timestamps`.
+        wants_timestamps = hasattr(active_config, "return_timestamps") and return_timestamps
+        if wants_timestamps:
+            processors.append(FlaxWhisperTimeStampLogitsProcessor(active_config, self.config, 1))
+        return super().generate(
+            input_features,
+            active_config,
+            logits_processor=processors,
+            **kwargs,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        max_length,
+        attention_mask: Optional[jax.Array] = None,
+        decoder_attention_mask: Optional[jax.Array] = None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        """Assemble the keyword arguments for the first decoding step.
+        Initialises the cache via `init_cache`, builds a `(batch_size, max_length)` attention mask and the
+        starting position ids. `**kwargs` is accepted but not used.
+        """
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape
+        cache = self.init_cache(batch_size, max_length, encoder_outputs)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        full_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is None:
+            positions = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        else:
+            positions = decoder_attention_mask.cumsum(-1) - 1
+            # Copy the caller's mask into the top-left corner of the all-ones mask.
+            full_mask = lax.dynamic_update_slice(full_mask, decoder_attention_mask, (0, 0))
+        return {
+            "past_key_values": cache,
+            "encoder_outputs": encoder_outputs,
+            "encoder_attention_mask": attention_mask,
+            "decoder_attention_mask": full_mask,
+            "decoder_position_ids": positions,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        """Refresh `model_kwargs` in place for the next step: store the new cache and advance the positions."""
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        # Keep only the last position of each row and move it forward by one.
+        last_positions = model_kwargs["decoder_position_ids"][:, -1:]
+        model_kwargs["decoder_position_ids"] = last_positions + 1
+        return model_kwargs
+# Example section appended to `WHISPER_INPUTS_DOCSTRING` for the call documentation registered below.
+# NOTE(review): the bare `Returns:` line is presumably the placeholder filled in by
+# `append_replace_return_docstrings` - keep it on its own line.
+# `generate` in this file takes the audio features as its first parameter, `input_features`, so the example
+# passes them positionally.
+FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING = r"""
+    Returns:
+    Transcription example:
+    ```python
+    >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+    >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
+    >>> input_features = inputs.input_features
+    >>> generated_ids = model.generate(input_features)
+    >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    >>> transcription
+    ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
+    ```
+"""
+overwrite_call_docstring(
+    FlaxWhisperForConditionalGeneration,
+    WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING,
+)
+append_replace_return_docstrings(
+    FlaxWhisperForConditionalGeneration,
+    output_type=FlaxSeq2SeqLMOutput,
+    config_class=_CONFIG_FOR_DOC,
+)

distil_whisper/partitioner.py ADDED Viewed

	@@ -0,0 +1,965 @@

+# Copyright 2022 The T5X Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for partitioning."""
+import abc
+import collections
+import dataclasses
+import typing
+from typing import Any, Callable, Optional, Sequence, Tuple, Union
+import cached_property
+import jax
+import numpy as np
+from absl import logging
+from flax import traverse_util
+from flax.linen import partitioning as flax_partitioning
+from jax import numpy as jnp
+from jax import random
+from jax.experimental import multihost_utils
+from jax.experimental.mesh_utils import create_hybrid_device_mesh
+from jax.experimental.pjit import pjit as jax_pjit
+from jax.sharding import Mesh, PartitionSpec
+# Type aliases used by the annotations in this module.
+# Device objects are typed loosely as `Any`.
+JaxDevice = Any
+TpuMesh = Tuple[int, int, int, int]  # (x, y, z, num_cores).
+# Two-integer mesh for the non-TPU case.
+OtherMesh = Tuple[int, int]
+HardwareMesh = Union[TpuMesh, OtherMesh]
+# Runtime type of the object returned by `jax.tree_util.tree_structure`.
+PyTreeDef = type(jax.tree_util.tree_structure(None))
+TrainState = Any
+# Sequence of (str, optional str) pairs - presumably (logical axis name, mesh axis name or None); confirm
+# against the code that consumes these rules.
+LogicalAxisRules = Sequence[Tuple[str, Optional[str]]]
+# Under static type checking `cached_property` is aliased to the builtin `property`; at runtime the imported
+# `cached_property` module name is rebound to the decorator it provides.
+if typing.TYPE_CHECKING:  # See b/163639353
+    cached_property = property  # pylint: disable=invalid-name
+else:
+    cached_property = cached_property.cached_property
+class AxisNames(tuple):
+    """Tuple of strings specifying name for each axis.
+    We create a separate class for this so JAX's pytree utilities can distinguish
+    it from a tuple that should be treated as a pytree, instead treating it as a
+    leaf.
+    Construct it with the names as separate positional arguments, e.g. `AxisNames("a", "b")`.
+    """
+    def __new__(cls, *names):
+        # Build through `cls` rather than the hard-coded base so subclasses get instances of themselves.
+        return tuple.__new__(cls, names)
+    def __repr__(self):
+        # Class name followed by the plain tuple representation, e.g. AxisNames('a', 'b').
+        return "%s%s" % (type(self).__name__, tuple.__repr__(self))
+# pjit wrappers for cpu fallback.
+# ----------------------------------------------------------------------------
+# TODO(levskaya): This function is now no different than jax_pjit, but callers
+# currently depend on `backend` argument
+def pjit(
+    fun: Callable,  # pylint: disable=g-bare-generic
+    in_axis_resources,
+    out_axis_resources,
+    static_argnums: Union[int, Sequence[int]] = (),
+    donate_argnums: Union[int, Sequence[int]] = (),
+    backend: Optional[str] = None,
+):
+    """Pass-through to `jax_pjit`; `backend` is accepted for existing callers and then discarded."""
+    del backend
+    # The axis resources go through positionally; the argnums options go through by keyword.
+    jit_options = {"static_argnums": static_argnums, "donate_argnums": donate_argnums}
+    return jax_pjit(fun, in_axis_resources, out_axis_resources, **jit_options)
+# pjit wrappers for cpu fallback.
+# -----------------------------------------------------------------------------
+# TODO(levskaya): upstream this fallback behavior to jax pjit.
+def pjit_with_cpu_fallback(
+    fun: Callable,  # pylint: disable=g-bare-generic
+    in_axis_resources,
+    out_axis_resources,
+    static_argnums: Union[int, Sequence[int]] = (),
+    donate_argnums: Union[int, Sequence[int]] = (),
+    backend: Optional[str] = None,
+):
+    """Partition `fun` with pjit, or use plain `jax.jit` when running on CPU.
+    The platform of the first device returned by `jax.devices(backend)` decides
+    which path is taken; the axis-resource arguments are unused on the CPU path.
+    """
+    runs_on_cpu = jax.devices(backend)[0].platform == "cpu"
+    if not runs_on_cpu:
+        return jax_pjit(
+            fun,
+            in_axis_resources,
+            out_axis_resources,
+            static_argnums=static_argnums,
+            donate_argnums=donate_argnums,
+        )
+    return jax.jit(fun, static_argnums=static_argnums, donate_argnums=donate_argnums)
+def with_sharding_constraint(x, axis_resources):
+    """Apply pjit's sharding constraint to `x`; return `x` as-is on CPU or when no global mesh is defined."""
+    on_cpu = jax.devices()[0].platform == "cpu"
+    # The mesh check is only evaluated when the first device is not a CPU.
+    if on_cpu or not global_mesh_defined():
+        return x
+    return jax.experimental.pjit.with_sharding_constraint(x, axis_resources)
+# pjit Mesh creation functions.
+# -----------------------------------------------------------------------------
+def bounds_from_last_device(last_device: JaxDevice) -> HardwareMesh:
+    """Derive the hardware mesh bounds from the highest-coordinate device."""
+    # The argument has to be the device sitting at the highest-coordinate corner
+    # of the mesh of interest; the last entry of jax.devices() is known to
+    # satisfy that requirement.
+    if not hasattr(last_device, "coords"):
+        # Devices without coordinates (non-TPU platforms) are laid out as
+        # hosts x devices-per-host so the faster within-host interconnect is used.
+        return jax.host_count(), jax.local_device_count()
+    max_x, max_y, max_z = last_device.coords
+    return max_x + 1, max_y + 1, max_z + 1, last_device.core_on_chip + 1
+def get_coords(device: JaxDevice) -> HardwareMesh:
+    """Return the mesh coordinates of `device`.
+    Devices exposing `coords` yield those coordinates followed by their
+    `core_on_chip`; any other device yields (process index, local device slot).
+    """
+    if not hasattr(device, "coords"):
+        local_slot = device.id % jax.local_device_count()
+        return (device.process_index, local_slot)
+    return (*device.coords, device.core_on_chip)
+def global_mesh_defined():
+    """Report whether a global xmap/pjit mesh resource environment is defined."""
+    physical_mesh = jax.experimental.maps.thread_resources.env.physical_mesh
+    # A device array with an empty shape is taken to mean that no mesh is set.
+    return physical_mesh.devices.shape != ()  # pylint: disable=g-explicit-bool-comparison
+def get_mesh(
+    model_parallel_submesh: HardwareMesh,
+    input_devices: Sequence[JaxDevice] = (),
+    input_local_devices: Sequence[JaxDevice] = (),
+    tile_by_host_if_needed: bool = True,
+    backend: Optional[str] = None,
+) -> Mesh:
+    """Construct an xmap/pjit Mesh for the given model-parallel submesh.
+    The resulting mesh has two resource axes: 'model', with the provided submesh
+    shape, and 'data', which covers the rest of the mesh.
+    Args:
+      model_parallel_submesh: a HardwareMesh spec, namely (x,y,z,core) on TPU for
+        a single model-parallel replica's "tile" in the physical device mesh. The
+        first three elements (`x`, `y`, and `z`) should be factors of the pod
+        slice; e.g., if you are using df_4x8, then `x` should be a factor of 4
+        (one of 1, 2, 4), `y` should be a factor of 8 (one of 1, 2, 4, 8), and `z`
+        must be 1, because TPU v3 slices are only 2D. `z` can be >1 for TPU v4
+        (and maybe later TPUs) that allow 3D slices. `core` is the number of cores
+        to use from each TPU node. As communication is usually fastest inside the
+        same node, if you need a tile of more than 1 core, then
+        you should first increase `core`: e.g., for TPU v3, (1,1,1,2) is better
+          than (2,1,1,1). To pick a good spec, try a few possible values until you
+          get high TPU utilization.
+      input_devices: the devices to use, will use jax.devices() if this is not
+        set.
+      input_local_devices: the local devices to use, will use jax.local_devices()
+        if this is not set.
+      tile_by_host_if_needed: JAX currently requires that the parts of any sharded
+        array that are located on one host's local devices form a single
+        contiguous slice. A best effort will be made to achieve this without
+        "tiling" the device assignment over hosts (which can reduce XLA collective
+        performance). If this flag is True, then the device assignment will be
+        tiled over hosts if necessary to satisfy this constraint and create a
+        buildable mesh; if false, mesh construction will fail instead.
+      backend: get devices from the pinned backend, if specified. This is
+        useful for explicitly specifying the devices other than relying on
+        jax_platform_name.
+    Returns:
+      A xmap / pjit Mesh containing the virtual device mesh with data, model axes.
+    """
+    # An empty (falsy) device sequence means "ask JAX"; note that the local
+    # lookup is pinned to process index 0.
+    input_devices = input_devices or jax.devices(backend)
+    input_local_devices = input_local_devices or jax.local_devices(0, backend)
+    # Sort input_devices based on coords, as backends might not return devices
+    # in order.
+    last_device = sorted(input_devices, key=get_coords)[-1]
+    last_input_local_devices = sorted(input_local_devices, key=get_coords)[-1]
+    logging.info(
+        "last device coords : %r\nlast local device coords: %r",
+        get_coords(last_device),
+        get_coords(last_input_local_devices),
+    )
+    global_hardware_mesh = bounds_from_last_device(last_device)
+    # Captured before any axis swapping below; used for the transposes at the end.
+    mesh_ndim = len(global_hardware_mesh)
+    local_hardware_mesh = bounds_from_last_device(last_input_local_devices)
+    mesh_err = (
+        f"each dimension of the model parallel submesh {model_parallel_submesh} "
+        "must be a factor of the corresponding dimension of the global device "
+        f"mesh {global_hardware_mesh}"
+    )
+    # A non-zero remainder is truthy, so `any(...)` fires when some dimension
+    # does not divide evenly.
+    assert not any(g % m for g, m in zip(global_hardware_mesh, model_parallel_submesh)), mesh_err
+    assert not any(g % l for g, l in zip(global_hardware_mesh, local_hardware_mesh))
+    # Object array shaped like the global mesh, with each device stored at its
+    # own coordinates.
+    devices = np.empty(global_hardware_mesh, dtype=object)
+    for device in input_devices:
+        device_coords = get_coords(device)
+        devices[device_coords] = device
+    tile_by_host = tile_by_host_if_needed
+    # Four bounds is the (x, y, z, core) form that `bounds_from_last_device`
+    # returns for devices exposing `coords`.
+    if len(global_hardware_mesh) == 4:
+        # enable contiguous local chunks without host tiling by making Z major
+        global_hardware_mesh = typing.cast(Tuple[int, int, int, int], global_hardware_mesh)
+        model_parallel_submesh = typing.cast(Tuple[int, int, int, int], model_parallel_submesh)
+        gx, gy, gz, gc = global_hardware_mesh
+        mx, my, mz, mc = model_parallel_submesh
+        if (mx == gx > 1 and my == mz == 1) or (mx == 1 and my == gy > 1 and mz == gz > 1):
+            logging.info("ensuring YZ plane has a Z-major device order")
+            # YZ should be ZY
+            assert mc == gc, (mc, gc)
+            global_hardware_mesh = gx, gz, gy, gc
+            model_parallel_submesh = mx, mz, my, mc
+            devices = devices.swapaxes(1, 2)
+            tile_by_host = False
+        if (my == gy > 1 and mx == mz == 1) or (my == 1 and mx == gx > 1 and mz == gz > 1):
+            logging.info("ensuring XZ plane has a Z-major device order")
+            # XZ should be ZX
+            assert mc == gc, (mc, gc)
+            global_hardware_mesh = gz, gy, gx, gc
+            model_parallel_submesh = mz, my, mx, mc
+            devices = devices.swapaxes(0, 2)
+            tile_by_host = False
+    if tile_by_host:
+        logging.warning(
+            "Tiling device assignment mesh by hosts, which may lead to "
+            "reduced XLA collective performance. To avoid this, modify "
+            "the model parallel submesh or run with more tasks per host."
+        )
+        tile_err = (
+            "to tile the mesh by hosts, each dimension of the model parallel "
+            "submesh must be either a factor or a multiple of the corresponding "
+            "dimension of the per-host submesh"
+        )
+        def dh_dd_mh_md(g: int, m: int, l: int) -> Tuple[int, int, int, int]:
+            """Split a global mesh dimension into four tiling components.
+            Args:
+              g: global mesh bounds dimension size
+              m: model-parallel submesh bounds dimension size
+              l: local submesh bounds dimension size
+            Returns:
+              The resulting tuple divides the dimension into the hosts component of
+              the data-parallel submesh, the devices component of the data-parallel
+              submesh, the hosts component of the model-parallel submesh, and the
+              devices component of the model-parallel submesh.
+            """
+            # Number of model-parallel tiles that fit along this dimension.
+            d = g // m
+            if m >= l:
+                assert not m % l, tile_err
+                return (d, 1, m // l, l)
+            else:
+                assert not l % m, tile_err
+                return (d // (l // m), l // m, 1, m)
+        # e.g. [(x_data_hosts, x_data_devs, x_model_hosts, x_model_devs), ...]
+        dh_dd_mh_md_tups = map(
+            dh_dd_mh_md,
+            global_hardware_mesh,
+            model_parallel_submesh,
+            local_hardware_mesh,
+        )
+        # reshape to e.g. (x_dh, x_dd, x_mh, x_md, y_dh, ...)
+        devices = devices.reshape(*(s for t in dh_dd_mh_md_tups for s in t))  # pylint: disable=g-complex-comprehension
+        # TODO(jekbradbury): reorder local subgroups for ring locality
+        # Transpose to [data_host], [data_device], [model_host], [model_device]
+        # block ordering e.g. (x_dh, y_dh, ..., x_dd, y_dd, ...)
+        devices = devices.transpose(
+            *(4 * i for i in range(mesh_ndim)),
+            *(4 * i + 1 for i in range(mesh_ndim)),
+            *(4 * i + 2 for i in range(mesh_ndim)),
+            *(4 * i + 3 for i in range(mesh_ndim)),
+        )
+    else:
+        # e.g. [(x_data, x_model), (y_data, y_model), ...]
+        model_data_tups = [(g // m, m) for g, m in zip(global_hardware_mesh, model_parallel_submesh)]
+        # reshape to e.g. (x_data, x_model, y_data, y_model...)
+        devices = devices.reshape(*(s for t in model_data_tups for s in t))  # pylint: disable=g-complex-comprehension
+        # TODO(jekbradbury): reorder small subgroups for ring locality
+        # transpose to e.g. (x_data, y_data, ..., x_model, ...)
+        devices = devices.transpose(*(2 * i for i in range(mesh_ndim)), *(2 * i + 1 for i in range(mesh_ndim)))
+    # reshape to (data, model)
+    devices = devices.reshape(-1, np.prod(model_parallel_submesh))
+    global_mesh = Mesh(devices, ["data", "model"])
+    logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
+    logging.info("global_mesh devices: %s", global_mesh.devices)
+    logging.info("global_mesh devices shape: %s", global_mesh.devices.shape)
+    return global_mesh
+def get_cpu_mesh() -> Mesh:
+    """Build the trivial hosts x local-devices mesh used for CPU testing."""
+    device_grid = np.empty((jax.host_count(), jax.local_device_count()), dtype=object)
+    for dev in jax.devices():
+        row = dev.process_index
+        col = dev.id % jax.local_device_count()
+        device_grid[row, col] = dev
+    return Mesh(device_grid, ["data", "model"])
+def get_gpu_mesh(num_partitions: int) -> Mesh:
+    """Build a GPU mesh that places the 'model' axis on NVLink first.
+    Model parallelism is filled from the devices local to a process before it
+    spills across processes; whatever remains becomes the 'data' axis.
+    """
+    local_gpus = jax.local_device_count()
+    num_processes = jax.process_count()
+    mp_within_host = min(num_partitions, local_gpus)
+    dp_within_host, leftover_local = divmod(local_gpus, mp_within_host)
+    mp_across_hosts, leftover_partitions = divmod(num_partitions, mp_within_host)
+    assert not (
+        leftover_local or leftover_partitions
+    ), "number of partitions on GPU must be a factor or multiple of the number of local devices"
+    dp_across_hosts = num_processes // mp_across_hosts
+    device_grid = create_hybrid_device_mesh(
+        mesh_shape=[dp_within_host, mp_within_host],
+        dcn_mesh_shape=[dp_across_hosts, mp_across_hosts],
+        process_is_granule=True,
+    )
+    gpu_mesh = Mesh(device_grid, ["data", "model"])
+    logging.info("global_mesh axis_names: %s", gpu_mesh.axis_names)
+    logging.info("global_mesh devices: %s", gpu_mesh.devices)
+    return gpu_mesh
+def default_mesh(
+    num_partitions: int,
+    model_parallel_submesh: Optional[HardwareMesh] = None,
+    backend: Optional[str] = None,
+) -> Mesh:
+    """Pick a reasonable mesh for the common hardware configurations.
+    Args:
+      num_partitions: size of the model-parallel group; not consulted when
+        `model_parallel_submesh` is given.
+      model_parallel_submesh: explicit (x, y, z, c) tile to use as the
+        model-parallel submesh.
+      backend: pinned backend to fetch devices from, instead of relying on
+        jax_platform_name.
+    Returns:
+      xmap/pjit 2D Mesh with 'data', 'model' mesh axes.
+    Raises:
+      ValueError: if no default exists for the detected TPU kind and
+        `num_partitions` combination.
+    """
+    last_device = jax.devices(backend)[-1]
+    platform, device_kind = last_device.platform, last_device.device_kind
+    bounds = bounds_from_last_device(last_device)
+    # An explicit submesh wins over every platform-specific default.
+    if model_parallel_submesh:
+        return get_mesh(model_parallel_submesh, backend=backend)
+    if platform == "cpu":
+        return get_cpu_mesh()
+    if platform == "gpu":
+        return get_gpu_mesh(num_partitions)
+    mps = None
+    if device_kind in ("TPU v2", "TPU v3"):
+        v2_v3_tiles = {
+            1: (1, 1, 1, 1),
+            2: (1, 1, 1, 2),
+            4: (2, 1, 1, 2),
+            8: (2, 2, 1, 2),
+            16: (4, 2, 1, 2),
+        }
+        mps = v2_v3_tiles.get(num_partitions)
+    # assume the use of megacore on TPU v4
+    elif device_kind in ("TPU v4", "TPU v4 lite") and bounds[3] == 1:
+        x_bound, z_bound = bounds[0], bounds[2]
+        if num_partitions == 1:
+            mps = (1, 1, 1, 1)
+        elif num_partitions == 2:
+            mps = (1, 2, 1, 1)
+        elif num_partitions == 4:
+            mps = (4, 1, 1, 1) if x_bound >= 4 else (2, 2, 1, 1)
+        elif num_partitions == 8:
+            mps = (1, 1, 8, 1) if z_bound >= 8 else (4, 2, 1, 1)
+        elif num_partitions == 16:
+            if z_bound >= 16:
+                mps = (1, 1, 16, 1)
+            elif x_bound >= 8:
+                mps = (8, 2, 1, 1)
+            elif x_bound >= 4:
+                mps = (4, 4, 1, 1)
+            else:
+                mps = (2, 2, 4, 1)
+    if mps is not None:
+        return get_mesh(mps, backend=backend)
+    raise ValueError(
+        "No default mesh for this configuration: specify " "config.model_parallel_submesh explicitly."
+    )
+# Data chunking helper.
+# -----------------------------------------------------------------------------
+# Result record produced by `LocalChunker.get_local_chunk_info`.
+@dataclasses.dataclass
+class LocalChunkInfo:
+    # The logical slice of an array located on this host's local devices.
+    # One `slice` object per array dimension; unsharded dimensions carry `slice(None)`.
+    slice: Tuple[slice, ...]
+    # A unique index for this host/local chunk among chunks with the same slice.
+    replica_id: int
+class LocalChunker:
+    """Helper that works out which chunk of a sharded array belongs to this host."""
+    def __init__(self, global_mesh: Mesh):
+        self.global_mesh = global_mesh
+        local_mesh = global_mesh.local_mesh
+        anchor_device = local_mesh.devices.reshape(-1)[0]
+        # Position of this host's first local device inside the global device grid.
+        anchor_coords = list(zip(*np.nonzero(global_mesh.devices == anchor_device)))[0]
+        host_location = collections.OrderedDict(zip(global_mesh.shape.keys(), anchor_coords))
+        self.num_chunks = collections.OrderedDict()
+        self.chunk_ids = collections.OrderedDict()
+        self.mesh_axes = list(global_mesh.shape.keys())
+        for axis_name in self.mesh_axes:
+            devices_per_chunk = local_mesh.shape[axis_name]
+            self.num_chunks[axis_name] = global_mesh.shape[axis_name] // devices_per_chunk
+            self.chunk_ids[axis_name] = host_location[axis_name] // devices_per_chunk
+    def get_local_chunk_info(
+        self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
+    ) -> LocalChunkInfo:
+        """Compute this host's chunk of an array with the given shape and sharding.
+        Args:
+          global_shape: the full, unsharded shape of the array.
+          mesh_axes: one entry per array dimension, naming the mesh axis that
+            dimension is sharded over, or None when it is not sharded.
+        Returns:
+          LocalChunkInfo holding the slices of the array that live on this host's
+          local devices together with the replica index of this chunk among all
+          chunks covering the same slice. The replica index is used to decide
+          which host writes the chunk during checkpointing.
+        Raises:
+          NotImplementedError: if a truthy entry of `mesh_axes` is not a string.
+        """
+        local_slice = [slice(None)] * len(global_shape)
+        sharded_axes = set()
+        for dim_index, (axis_name, dim_size) in enumerate(zip(mesh_axes, global_shape)):
+            if axis_name:
+                sharded_axes.add(axis_name)
+                if not isinstance(axis_name, str):
+                    raise NotImplementedError("TODO(jekbradbury)")
+                chunk_id = self.chunk_ids[axis_name]
+                chunk_size = dim_size // self.num_chunks[axis_name]
+                local_slice[dim_index] = slice(chunk_id * chunk_size, (chunk_id + 1) * chunk_size)
+        # Fold the chunk ids of every axis the array is not sharded over into a
+        # single mixed-radix index.
+        replica_id = 0
+        for axis_name in self.mesh_axes:
+            if axis_name in sharded_axes:
+                continue
+            replica_id = replica_id * self.num_chunks[axis_name] + self.chunk_ids[axis_name]
+        return LocalChunkInfo(tuple(local_slice), replica_id)
+def standard_logical_axis_rules(
+    activation_partitioning_dims: int = 1,
+    parameter_partitioning_dims: int = 1,
+    additional_rules: Optional[LogicalAxisRules] = None,
+) -> LogicalAxisRules:
+    """Build the default logical-axis sharding rules for a T5X model.
+    Args:
+      activation_partitioning_dims: set to 2 to turn on 2-D activation sharding.
+      parameter_partitioning_dims: set to 2 to turn on 2-D parameter sharding.
+      additional_rules: extra (logical axis, mesh axis) tuples placed after the
+        standard rules.
+    Returns:
+      Sequence of logical axis rules
+    Raises:
+      ValueError: if either argument is something other than 1 or 2.
+    """
+    logging.info(
+        "`activation_partitioning_dims` = %d, `parameter_partitioning_dims` = %d",
+        activation_partitioning_dims,
+        parameter_partitioning_dims,
+    )
+    partitioning = (activation_partitioning_dims, parameter_partitioning_dims)
+    if partitioning not in ((1, 1), (2, 1), (1, 2), (2, 2)):
+        raise ValueError(
+            f"`activation_partitioning_dims` = {activation_partitioning_dims} "
+            f"`parameter_partitioning_dims` = {parameter_partitioning_dims} "
+            "is not supported."
+        )
+    if partitioning == (1, 1):
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("embed", None),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),  # joined heads+kv dim in 2D attn param layouts
+        ]
+    else:
+        # Every 2-D variant shares this prefix and differs only in how "embed"
+        # is mapped, which goes at the end.
+        rules = [
+            ("batch", "data"),
+            ("vocab", "model"),
+            ("mlp", "model"),
+            ("heads", "model"),
+            ("kv", None),
+            ("joined_kv", "model"),
+        ]
+        if activation_partitioning_dims == 2:
+            rules.append(("embed", "model"))
+        if parameter_partitioning_dims == 2:
+            rules.append(("embed", "data"))
+    # Logical axes that are always replicated, whatever the partitioning mode.
+    for replicated_axis in (
+        "relpos_buckets",
+        "abspos_buckets",
+        "length",
+        "layers",
+        "stack",
+        "mlp_activations",
+    ):
+        rules.append((replicated_axis, None))
+    if additional_rules:
+        rules.extend(additional_rules)
+    return rules
+# NB: This needs to be top-level for the jax compilation cache.
+def _id_fn(x, ix):
+    """Return `x` unchanged together with a PRNG key split seeded from `ix`; used to copy parameters to the devices, sharded."""
+    # A bare identity (`lambda x, *: x`) may be optimized away entirely, so a
+    # cheap random.split is added as work that cannot be removed.
+    seed = jnp.array(ix, dtype=jnp.uint32)
+    split_keys = random.split(random.PRNGKey(seed))
+    return x, split_keys
+@dataclasses.dataclass
+class DataLayout:
+    """Represents data layout for the partitioned model."""
+    # Global batch size; `BasePartitioner.get_data_layout` falls back to the
+    # data-axis mesh size when the caller passes none.
+    batch_size: int
+    # Chunk id of this host along the data axis.
+    shard_id: int
+    # Number of chunks along the data axis.
+    num_shards: int
+    # True when this host's replica id for the batch dimension is 0.
+    is_first_host_in_replica_set: bool
+# Return-type aliases for `BasePartitioner.partition` and `BasePartitioner.compile`
+# respectively; both are plain `Callable[..., Any]`.
+PartitionedCallable = Callable[..., Any]
+CompiledPartitionedCallable = Callable[..., Any]
+class BasePartitioner(metaclass=abc.ABCMeta):
+    """Interface for partitioning computations across hardware devices.
+    Subclasses must provide `_local_chunker`, `partition` and `compile`; `mesh`
+    and `get_mesh_axes` raise NotImplementedError unless overridden.
+    """
+    def __init__(
+        self,
+        num_partitions: Optional[int] = None,
+        model_parallel_submesh: Optional[HardwareMesh] = None,
+        params_on_devices: bool = True,
+        backend: Optional[str] = None,
+    ):
+        """Configures the partitioner.
+        Args:
+          num_partitions: the number of partitions to use. Ignored if
+            `model_parallel_submesh` is provided.
+          model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use
+            as the model-parallel device tile. This submesh is used for the larger
+            of the two parameter dimensions, and, if 2-D activation sharding is
+            enabled, for the model dimension of activations. The rest of the mesh is
+            used for data parallelism and, if 2-D parameter sharding is enabled, the
+            other parameter dimension.
+          params_on_devices: whether to keep the params on devices, if False -
+            params stay in the host memory. Note that some partitioners might ignore
+            this setting, for example if they don't support storing all params on
+            device memory.
+          backend: get devices from the pinned backend, if specified. This is useful
+            for explicitly specifying the devices other than relying on
+            jax_platform_name.
+        Raises:
+          ValueError: if neither `num_partitions` nor `model_parallel_submesh` is
+            set.
+        """
+        if not num_partitions and not model_parallel_submesh:
+            raise ValueError("At least one of `num_partitions` or " "`model_parallel_submesh` must be set.")
+        # The two checks below only log: callers that pass such arguments keep
+        # working, so they are deliberately not turned into exceptions here.
+        if model_parallel_submesh is not None and len(model_parallel_submesh) != 4:
+            logging.error(
+                (
+                    "`model_parallel_submesh` must be either None or a 4-tuple. Got"
+                    " `model_parallel_submesh`=%s. A ValueError will be raised"
+                    " beginning March 1, 2022."
+                ),
+                model_parallel_submesh,
+            )
+        if bool(num_partitions) and bool(model_parallel_submesh):
+            logging.error(
+                (
+                    "At most one of `num_partitions` or `model_parallel_submesh` can be"
+                    " set. Got `num_partitions=%s` and `model_parallel_submesh`=%s. A"
+                    " ValueError will be raised beginning March 21, 2022."
+                ),
+                num_partitions,
+                model_parallel_submesh,
+            )
+        self._num_partitions = num_partitions
+        self._model_parallel_submesh = model_parallel_submesh
+        self._params_on_devices = params_on_devices
+        self._data_axis = "data"
+        self._backend = backend
+    @property
+    def mesh(self) -> Mesh:
+        """The device mesh used by this partitioner; must be supplied by subclasses."""
+        raise NotImplementedError
+    @property
+    def data_partition_spec(self) -> PartitionSpec:
+        """PartitionSpec that shards the leading dimension over the data axis."""
+        return PartitionSpec(self._data_axis)
+    def get_data_layout(self, batch_size: Optional[int] = None, host_index: Optional[int] = None) -> DataLayout:
+        """Returns filled `DataLayout` based on the partitioned model layout.
+        Args:
+          batch_size: if set, indicates the requested batch size. The exception will
+            be raised if this batch size is not compatible with the layout. If not
+            set, the batch size is inferred from the layout.
+          host_index: indicates the host index to use for the calculations, if not
+            set - use JAX-provided one. Should be in [0, num_hosts) interval and the
+            order should match the order of corresponding CPU devices in
+            `jax.devices()`.
+        Returns:
+          Filled `DataLayout` structure.
+        Raises:
+          NotImplementedError: if `host_index` is given.
+          ValueError: if `batch_size` is not divisible by the data-axis mesh size
+            or by the number of data-axis chunks.
+        """
+        if host_index is not None:
+            raise NotImplementedError("Explicit host_index is not yet implemented.")
+        if self._data_axis is None:
+            return DataLayout(
+                batch_size=batch_size,
+                shard_id=0,
+                num_shards=1,
+                is_first_host_in_replica_set=(jax.process_index() == 0),
+            )
+        mesh_size = self._local_chunker.global_mesh.shape[self._data_axis]
+        batch_size = batch_size or mesh_size
+        if batch_size % mesh_size:
+            raise ValueError(
+                f"Batch size ({batch_size}) must be divisible by corresponding " f"mesh size ({mesh_size})."
+            )
+        num_shards = self._local_chunker.num_chunks[self._data_axis]
+        if batch_size % num_shards:
+            raise ValueError(f"Batch size ({batch_size}) must be divisible by number of " f"replicas ({num_shards}).")
+        replica_id = self._local_chunker.get_local_chunk_info((batch_size,), [self._data_axis]).replica_id
+        return DataLayout(
+            batch_size=int(batch_size),
+            shard_id=int(self._local_chunker.chunk_ids[self._data_axis]),
+            num_shards=int(num_shards),
+            is_first_host_in_replica_set=(replica_id == 0),
+        )
+    def get_local_chunk_info(
+        self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
+    ) -> LocalChunkInfo:
+        """Returns the local chunk info for a given array shape and sharded axes."""
+        return self._local_chunker.get_local_chunk_info(global_shape, mesh_axes)
+    @property
+    def params_on_devices(self):
+        """Whether parameters are meant to be kept on devices (as passed to the constructor)."""
+        return self._params_on_devices
+    def move_params_to_devices(self, train_state: TrainState, train_state_axes: TrainState) -> TrainState:
+        """Moves the optimizer parameters to devices.
+        Args:
+          train_state: the state to transfer; its buffers are donated to the
+            partitioned identity function.
+          train_state_axes: axis resources matching `train_state`, used for both
+            the input and the output of the transfer.
+        Returns:
+          The train state as returned by the partitioned identity function.
+        """
+        p_id_fn = self.partition(
+            _id_fn,
+            in_axis_resources=(train_state_axes, None),
+            out_axis_resources=(train_state_axes, None),
+            donate_argnums=(0,),
+        )
+        # The `jax_array` config flag only exists on older JAX releases; on newer
+        # ones reading it raises AttributeError, so a missing flag is treated as
+        # enabled.
+        jax_array_enabled = getattr(jax.config, "jax_array", True)
+        if jax_array_enabled and jax.process_count() > 1:
+            train_state = multihost_utils.host_local_array_to_global_array(train_state, self.mesh, train_state_axes)
+        train_state, _ = p_id_fn(train_state, jnp.ones((), dtype=jnp.uint32))
+        return train_state
+    @property
+    @abc.abstractmethod
+    def _local_chunker(self):
+        """Returns the chunker that matches the parameters of this partitioner."""
+        raise NotImplementedError
+    def get_logical_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
+        # By default, return None for the logical axes.
+        # `jax.tree_util.tree_map` is used instead of the deprecated top-level
+        # `jax.tree_map` alias, which newer JAX releases no longer provide.
+        return train_state.restore_state(jax.tree_util.tree_map(lambda x: None, train_state.state_dict()))
+    def get_mesh_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
+        raise NotImplementedError
+    @abc.abstractmethod
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PartitionedCallable:
+        """Partitions the computation using partitioner-specific implementation.
+        Args:
+          fn: the function to partition.
+          in_axis_resources: Pytree of structure matching that of arguments to `fn`,
+            with all actual arguments replaced by resource assignment
+            specifications. It is also valid to specify a pytree prefix (e.g. one
+            value in place of a whole subtree), in which case the leaves get
+            broadcast to all values in that subtree.
+            The valid resource assignment specifications are:
+              `None`: in which case the value will be replicated on all devices
+              `PartitionSpec`: a tuple of length at most equal to the rank of the
+                partitioned value. Each element can be a `None`, a mesh axis or a
+                tuple of mesh axes, and specifies the set of resources assigned to
+                partition the value's dimension matching its position in the spec.
+          out_axis_resources: Like `in_axis_resources`, but specifies resource
+            assignment for function outputs.
+          static_argnums: an optional int or collection of ints that specify which
+            positional arguments to treat as static (compile-time constant) in the
+            partitioned function.
+          donate_argnums: an optional int or collection of ints that specify which
+            argument buffers are "donated" to the computation. It is safe to donate
+            argument buffers if you no longer need them once the computation has
+            finished.
+        Returns:
+          A partitioned version of the input function.
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def compile(self, partitioned_fn: PartitionedCallable, *args) -> CompiledPartitionedCallable:
+        """Compiles and returns the partitioned function, or the original.
+        Args:
+          partitioned_fn: The partitioned function.
+          *args: Sample arguments to the partitioned function matching the input
+            shapes that will be passed to the compiled function.
+        Returns:
+          The compiled function, or the original if this partitioner does not
+          support compilation.
+        """
+        raise NotImplementedError
+class PjittedFnWithContext(PartitionedCallable):
+    """Runs a pjitted function inside its mesh and logical-axis-rule contexts."""
+    def __init__(
+        self,
+        pjitted_fn,
+        partition_mesh: Mesh,
+        logical_axis_rules: flax_partitioning.LogicalRules = (),
+    ):
+        self._pjitted_fn = pjitted_fn
+        self._mesh = partition_mesh
+        self._logical_axis_rules = logical_axis_rules
+    def __call__(self, *args):
+        # A fresh Mesh is built from the stored one for every invocation, and the
+        # axis rules are entered only once that mesh is active.
+        active_mesh = Mesh(self._mesh.devices, self._mesh.axis_names)
+        with active_mesh:
+            with flax_partitioning.axis_rules(self._logical_axis_rules):
+                return self._pjitted_fn(*args)
+    def lower(self, *args):
+        # Same context set-up as `__call__`, but lowers instead of executing.
+        active_mesh = Mesh(self._mesh.devices, self._mesh.axis_names)
+        with active_mesh:
+            with flax_partitioning.axis_rules(self._logical_axis_rules):
+                return self._pjitted_fn.lower(*args)
+class BasePjitPartitioner(BasePartitioner):
+    """Partitioner built on the T5X flavour of jax.pjit."""
+    @cached_property
+    def _local_chunker(self) -> LocalChunker:
+        # Built on first access from this partitioner's mesh.
+        chunker = LocalChunker(self.mesh)
+        return chunker
+    @cached_property
+    def mesh(self) -> Mesh:
+        # Built on first access from the constructor arguments.
+        partitions = self._num_partitions
+        submesh = self._model_parallel_submesh
+        return default_mesh(partitions, submesh, self._backend)
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PjittedFnWithContext:
+        """Wrap `fn` with pjit and bind it to this partitioner's mesh."""
+        pjit_options = dict(
+            in_axis_resources=in_axis_resources,
+            out_axis_resources=out_axis_resources,
+            static_argnums=static_argnums,
+            donate_argnums=donate_argnums,
+            backend=self._backend,
+        )
+        return PjittedFnWithContext(pjit(fn, **pjit_options), self.mesh)
+    def compile(self, partitioned_fn: PjittedFnWithContext, *args) -> CompiledPartitionedCallable:
+        """Lower `partitioned_fn` for the sample `args` and compile the result."""
+        lowered = partitioned_fn.lower(*args)
+        return lowered.compile()
+class PjitPartitioner(BasePjitPartitioner):
+    """Partitioner that uses named axes and jax.pjit."""
+    def __init__(
+        self,
+        num_partitions: Optional[int] = None,
+        model_parallel_submesh: Optional[HardwareMesh] = None,
+        params_on_devices: bool = True,
+        backend: Optional[str] = None,
+        logical_axis_rules: Optional[LogicalAxisRules] = None,
+        use_cpu_pjit: Optional[bool] = False,
+    ):
+        """PjitPartitioner constructor.
+        See https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.mdx/usage/partitioning for details.
+        Args:
+          num_partitions: an integer that specifies the size of the model parallel
+            submesh to be automatically selected for the current topology. See
+            `model_parallel_submesh` for details on how this submesh is used.
+            Mutually exlusive with `model_parallel_submesh`.
+          model_parallel_submesh: is a 4-tuple that specifies the `(x, y, z, c)`
+            submesh model-parallel device tile, an axis of accelerator parallelism
+            orthogonal to data parallelism. Array axes in a model's parameters or
+            activations can be sharded over this submesh using axis rules (see
+            `logical_axis_rules`) that map them to 'model'. The effective number of
+            model sub-partitions is equal to `np.prod(model_parallel_submesh)` and
+            must evenly divide the total number of devices (i.e.,
+            `jax.device_count() % np.prod(model_parallel_submesh) == 0`). The rest
+            of the TPU mesh is the data parallel submesh, providing
+            `jax.device_count() // np.prod(model_parallel_submesh)` partitions. It
+            is used for data (batch) parallelism and to shard other array axes that
+            are mapped to 'data'. This argument is mutually exclusive with
+            `num_partitions`.
+          params_on_devices: whether to keep the params on devices, if False -
+            params stay in the host memory. Note that some partitioners might ignore
+            this setting, for example if they don't support storing all params on
+            device memory.
+          backend: get devices from the pinned backend, if specified. This is
+            useful for explicitly specifying the devices other than relying on
+            jax_platform_name.
+          logical_axis_rules: a priority-ordered sequence of KV tuples that maps
+            logical axis names to either `None` (not sharded), 'model' (to shard
+            across the model-parallel submesh), or 'data' (to shard across the
+            data-parallel submesh).
+          use_cpu_pjit: enables wrapper function for pjit which just jits the
+            function if using CPU backend.
+        """
+        super().__init__(
+            num_partitions=num_partitions,
+            model_parallel_submesh=model_parallel_submesh,
+            params_on_devices=params_on_devices,
+            backend=backend,
+        )
+        if logical_axis_rules is None:
+            logical_axis_rules = standard_logical_axis_rules()
+        self._logical_axis_rules = tuple(logical_axis_rules)
+        (self._data_axis,) = flax_partitioning.logical_to_mesh_axes(["batch"], logical_axis_rules)
+        self._use_cpu_pjit = use_cpu_pjit
+    def partition(
+        self,
+        fn: Callable,  # pylint: disable=g-bare-generic
+        in_axis_resources,
+        out_axis_resources,
+        static_argnums: Union[int, Sequence[int]] = (),
+        donate_argnums: Union[int, Sequence[int]] = (),
+    ) -> PjittedFnWithContext:
+        """Partitions the function using jax.pjit."""
+        if self._use_cpu_pjit:
+            pjit_fn = pjit_with_cpu_fallback
+        else:
+            pjit_fn = pjit
+        pjitted = pjit_fn(
+            fn,
+            in_axis_resources=in_axis_resources,
+            out_axis_resources=out_axis_resources,
+            static_argnums=static_argnums,
+            donate_argnums=donate_argnums,
+            backend=self._backend,
+        )
+        return PjittedFnWithContext(pjitted, self.mesh, self._logical_axis_rules)
+    @property
+    def logical_axis_rules(self):
+        """Returns the logical axis rules, as the tuple stored by the constructor."""
+        return self._logical_axis_rules
+    def get_logical_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[AxisNames] as leaves.
+        Delegates entirely to `train_state.as_logical_axes()`.
+        """
+        return train_state.as_logical_axes()
+    def get_mesh_axes(self, train_state: TrainState) -> TrainState:
+        """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
+        logical_axes = self.get_logical_axes(train_state)
+        def _logical_to_mesh_axes(param_name, logical_axes):
+            if logical_axes is None:
+                return None
+            elif logical_axes is traverse_util.empty_node:
+                return traverse_util.empty_node
+            try:
+                return flax_partitioning.logical_to_mesh_axes(logical_axes, self._logical_axis_rules)
+            except ValueError as e:
+                raise ValueError(f"Failed to map logical axes for {param_name}") from e
+        flat_logical_axes = traverse_util.flatten_dict(logical_axes.state_dict(), keep_empty_nodes=True, sep="/")
+        flat_mesh_axes = {k: _logical_to_mesh_axes(k, v) for k, v in flat_logical_axes.items()}
+        return logical_axes.restore_state(traverse_util.unflatten_dict(flat_mesh_axes, sep="/"))

distil_whisper/pipeline.py ADDED Viewed

	@@ -0,0 +1,527 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Whisper JAX pipeline compatible with Distil Whisper checkpoints. Copied from https://github.com/sanchit-gandhi/whisper-jax/blob/main/whisper_jax/pipeline.py"""
+import math
+import jax
+import jax.numpy as jnp
+import numpy as np
+import requests
+import torch
+from flax import jax_utils
+from flax.core.frozen_dict import freeze
+from flax.training.common_utils import shard
+from transformers import WhisperFeatureExtractor, WhisperTokenizerFast
+from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
+from transformers.pipelines.audio_utils import ffmpeg_read
+from transformers.utils import logging
+from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
+# Module-level logger obtained from the `transformers.utils.logging` helper.
+logger = logging.get_logger(__name__)
+class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
+    def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
+        """
+        Compute the log-mel spectrogram of the provided audio using torch filters. Using the torch implementation
+        computes stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
+        in transformers, and matches to within 1e-5 abs tolerance.
+        """
+        waveform = torch.from_numpy(waveform).type(torch.float32)
+        window = torch.hann_window(self.n_fft)
+        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
+        magnitudes = stft[..., :-1].abs() ** 2
+        mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
+        mel_spec = mel_filters.T @ magnitudes
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        return log_spec.numpy()
+class FlaxWhisperPipeline:
+    def __init__(
+        self,
+        checkpoint="openai/whisper-large-v2",
+        dtype=jnp.float32,
+        batch_size=None,
+        max_length=None,
+        **kwargs,
+    ):
+        """
+        Load the feature extractor, tokenizer and Flax model for `checkpoint`, replicate the parameters and
+        build the pmapped generation function stored as `self.p_generate`.
+        Args:
+            checkpoint (`str`, *optional*, defaults to `"openai/whisper-large-v2"`):
+                The Whisper checkpoint to use with the pipeline. Must be an available checkpoint on the Hugging Face Hub
+                with Flax weights.
+            dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+                The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+                `jax.numpy.bfloat16` (on TPUs). This can be used to enable half-precision inference on GPUs or TPUs.
+                If specified all the computation will be performed with the given `dtype`. **Note that this only
+                specifies the dtype of the computation and does not influence the dtype of model parameters.**
+            batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
+                The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
+                a batch size in the `__init__` method will be superseded by any batch size passed to the `__call__` method.
+            max_length (`int`, *optional*):
+                The maximum numbers of tokens to generate. Defaults to `model.generation_config.max_length`.
+            **kwargs:
+                Forwarded unchanged to `FlaxWhisperForConditionalGeneration.from_pretrained`.
+        """
+        self.checkpoint = checkpoint
+        self.dtype = dtype
+        self.feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(self.checkpoint)
+        self.tokenizer = WhisperTokenizerFast.from_pretrained(self.checkpoint)
+        # With `_do_init=False` the model and its parameters come back as a pair.
+        self.model, self.params = FlaxWhisperForConditionalGeneration.from_pretrained(
+            self.checkpoint,
+            _do_init=False,
+            dtype=self.dtype,
+            **kwargs,
+        )
+        self.max_length = max_length if max_length is not None else self.model.generation_config.max_length
+        self.min_batch_size = jax.local_device_count()
+        self.batch_size = (
+            batch_size if batch_size is not None else self.min_batch_size
+        )  # we need a minimum of 1 batch per-device
+        # Closure handed to pmap below. It captures `self.model` and `self.max_length`,
+        # so `max_length` is fixed at construction time.
+        def generate(
+            params,
+            input_features,
+            forced_decoder_ids,
+            return_timestamps,
+            num_beams,
+            length_penalty,
+            do_sample,
+            top_k,
+            temperature,
+        ):
+            output_ids = self.model.pipeline_generate(
+                input_features,
+                params=params,
+                forced_decoder_ids=forced_decoder_ids,
+                return_timestamps=return_timestamps,
+                max_length=self.max_length,
+                num_beams=num_beams,
+                length_penalty=length_penalty,
+                do_sample=do_sample,
+                top_k=top_k,
+                temperature=temperature,
+            )
+            return output_ids
+        # Parameters are replicated once here and reused by every call.
+        self.params = jax_utils.replicate(self.params)
+        # NOTE(review): the second positional argument of `jax.pmap` is the axis name, so
+        # "input_features" below names the mapped axis rather than selecting an argument.
+        # `params` and `input_features` (positions 0 and 1) are mapped over their leading axis;
+        # `forced_decoder_ids` (position 2) is broadcast; positions 3-8 (`return_timestamps`
+        # through `temperature`) are static broadcast arguments.
+        self.p_generate = jax.pmap(
+            generate,
+            "input_features",
+            in_axes=(0, 0, None, None, None, None, None, None, None),
+            static_broadcasted_argnums=(
+                3,
+                4,
+                5,
+                6,
+                7,
+                8,
+            ),
+        )
+    def generate(
+        self,
+        input_features,
+        language=None,
+        task=None,
+        return_timestamps=False,
+        num_beams=1,
+        length_penalty=1.0,
+        do_sample=False,
+        top_k=50,
+        temperature=1.0,
+    ):
+        forced_decoder_ids = self.get_forced_decoder_ids(
+            language=language, task=task, return_timestamps=return_timestamps
+        )
+        # if we're using pmap we need to manually replicate the input data across devices and gather the output tokens
+        output_ids = self.p_generate(
+            freeze(self.params),
+            shard(input_features),
+            forced_decoder_ids,
+            return_timestamps,
+            num_beams,
+            length_penalty,
+            do_sample,
+            top_k,
+            temperature,
+        ).sequences
+        output_ids = jax.device_get(output_ids.reshape(-1, self.max_length))
+        return output_ids
+    def get_forced_decoder_ids(self, generation_config=None, task=None, language=None, return_timestamps=False):
+        if generation_config is None:
+            generation_config = self.model.generation_config
+        if hasattr(generation_config, "is_multilingual"):
+            is_multilingual = generation_config.is_multilingual
+        else:
+            is_multilingual = None
+        forced_decoder_ids = []
+        if is_multilingual:
+            if language is not None:
+                language = language.lower()
+                if language in generation_config.lang_to_id.keys():
+                    language_token = language
+                elif language in TO_LANGUAGE_CODE.values():
+                    language_token = f"<|{language}|>"
+                elif language in TO_LANGUAGE_CODE.keys():
+                    language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
+                else:
+                    if len(language) == 2:
+                        # ISO 639-1 language code
+                        acceptable_languages = list(TO_LANGUAGE_CODE.values())
+                    elif "<" in language or "|" in language or ">" in language:
+                        # generation config language code
+                        acceptable_languages = list(generation_config.lang_to_id.keys())
+                    else:
+                        # language passed as a string
+                        acceptable_languages = list(TO_LANGUAGE_CODE.keys())
+                    raise ValueError(
+                        f"Unsupported language: {language}. Language should be one of:" f" {acceptable_languages}."
+                    )
+                forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
+            if task is not None:
+                forced_decoder_ids.append((2, generation_config.task_to_id[task]))
+            else:
+                forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
+        if not return_timestamps:
+            if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
+                idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
+                forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
+            else:
+                forced_decoder_ids.append((1, generation_config.no_timestamps_token_id))
+        return forced_decoder_ids
+    def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
+        inputs_len = inputs.shape[0]
+        step = chunk_len - stride_left - stride_right
+        all_chunk_start_idx = np.arange(0, inputs_len, step)
+        num_samples = len(all_chunk_start_idx)
+        num_batches = math.ceil(num_samples / batch_size)
+        batch_idx = np.array_split(np.arange(num_samples), num_batches)
+        for idx in batch_idx:
+            chunk_start_idx = all_chunk_start_idx[idx]
+            chunk_end_idx = chunk_start_idx + chunk_len
+            chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
+            processed = self.feature_extractor(
+                chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
+            )
+            _stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
+            is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
+            _stride_right = np.where(is_last, 0, stride_right)
+            chunk_lens = [chunk.shape[0] for chunk in chunks]
+            strides = [
+                (chunk_l, _stride_l, _stride_r)
+                for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
+            ]
+            yield {"stride": strides, **processed}
+    def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
+        if isinstance(inputs, np.ndarray):
+            logger.warning(
+                "Numpy array passed as input - no sampling rate checks will be performed."
+                "It is strongly recommended to pass the input as a dictionary with an 'array' key "
+                "containing the numpy array representing the audio, and a 'sampling_rate' key "
+                "containing the sampling rate associated with the audio array."
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+        if isinstance(inputs, str):
+            if inputs.startswith("http://") or inputs.startswith("https://"):
+                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+                # like http_huggingface_co.png
+                inputs = requests.get(inputs).content
+            else:
+                with open(inputs, "rb") as f:
+                    inputs = f.read()
+        if isinstance(inputs, bytes):
+            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+        stride = None
+        if isinstance(inputs, dict):
+            stride = inputs.get("stride", None)
+            # Accepting `"array"` which is the key defined in `datasets` for
+            # better integration
+            if not ("sampling_rate" in inputs and "array" in inputs):
+                raise ValueError(
+                    "When passing a dictionary to FlaxWhisperPipline, the dict needs to contain an 'array' key "
+                    "containing the numpy array representing the audio, and a 'sampling_rate' key "
+                    "containing the sampling rate associated with the audio array."
+                )
+            in_sampling_rate = inputs.get("sampling_rate")
+            inputs = inputs.get("array", None)
+            if in_sampling_rate != self.feature_extractor.sampling_rate:
+                try:
+                    import librosa
+                except ImportError as err:
+                    raise ImportError(
+                        "To support resampling audio files, please install 'librosa' and 'soundfile'."
+                    ) from err
+                inputs = librosa.resample(
+                    inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
+                )
+                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
+            else:
+                ratio = 1
+        if not isinstance(inputs, np.ndarray):
+            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
+        if stride is not None:
+            if stride[0] + stride[1] > inputs.shape[0]:
+                raise ValueError("Stride is too large for input")
+            # Stride needs to get the chunk length here, it's going to get
+            # swallowed by the `feature_extractor` later, and then batching
+            # can add extra data in the inputs, so we need to keep track
+            # of the original length in the stride so we can cut properly.
+            stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
+        if chunk_length_s:
+            if stride_length_s is None:
+                stride_length_s = chunk_length_s / 6
+            if isinstance(stride_length_s, (int, float)):
+                stride_length_s = [stride_length_s, stride_length_s]
+            chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
+            stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
+            stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)
+            if chunk_len < stride_left + stride_right:
+                raise ValueError("Chunk length must be superior to stride length")
+            for item in self.chunk_iter_with_batch(
+                inputs,
+                chunk_len,
+                stride_left,
+                stride_right,
+                batch_size,
+            ):
+                yield item
+        else:
+            processed = self.feature_extractor(
+                inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
+            )
+            if stride is not None:
+                processed["stride"] = stride
+            yield processed
+    def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
+        # unpack the outputs from list(dict(list)) to list(dict)
+        model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]
+        time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
+        # Send the chunking back to seconds, it's easier to handle in whisper
+        sampling_rate = self.feature_extractor.sampling_rate
+        for output in model_outputs:
+            if "stride" in output:
+                chunk_len, stride_left, stride_right = output["stride"]
+                # Go back in seconds
+                chunk_len /= sampling_rate
+                stride_left /= sampling_rate
+                stride_right /= sampling_rate
+                output["stride"] = chunk_len, stride_left, stride_right
+        text, optional = self.tokenizer._decode_asr(
+            model_outputs,
+            return_timestamps=return_timestamps,
+            return_language=return_language,
+            time_precision=time_precision,
+        )
+        return {"text": text, **optional}
+    def forward(
+        self,
+        model_inputs,
+        batch_size=None,
+        language=None,
+        task=None,
+        return_timestamps=False,
+        num_beams=1,
+        length_penalty=1.0,
+        do_sample=False,
+        top_k=50,
+        temperature=1.0,
+    ):
+        # We need to keep track of some additional input arguments for post-processing so need to forward these on after running generation
+        input_features = model_inputs.pop("input_features")
+        input_batch_size = input_features.shape[0]
+        if input_batch_size != batch_size:
+            padding = np.zeros([batch_size - input_batch_size, *input_features.shape[1:]], input_features.dtype)
+            input_features = np.concatenate([input_features, padding])
+        pred_ids = self.generate(
+            input_features,
+            language=language,
+            task=task,
+            return_timestamps=return_timestamps,
+            num_beams=num_beams,
+            length_penalty=length_penalty,
+            do_sample=do_sample,
+            top_k=top_k,
+            temperature=temperature,
+        )[:input_batch_size]
+        # tokenizer's decode method expects an extra dim - we insert it here for convenience
+        out = {"tokens": pred_ids[:, None, :]}
+        stride = model_inputs.pop("stride", None)
+        if stride is not None:
+            out["stride"] = stride
+        return out
+    def __call__(
+        self,
+        inputs,
+        chunk_length_s=30.0,
+        stride_length_s=None,
+        batch_size=None,
+        language=None,
+        task=None,
+        return_timestamps=None,
+        num_beams=1,
+        length_penalty=1.0,
+        do_sample=False,
+        top_k=50,
+        temperature=1.0,
+    ):
+        """
+        Transcribe an audio input sequence to a text transcription, optionally with timestamps.
+        Args:
+            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs is either:
+                    - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
+                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+                    - `bytes` is the byte content of an audio file and is interpreted by *ffmpeg* in the
+                      same way.
+                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+                        Raw audio assumed to be at the correct sampling rate (16kHz). Note that no further sampling
+                        rate check will be done.
+                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+                      pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "array":
+                      np.array}`. Optionally an additional argument `"stride": (left: int, right: int)` can be used to
+                       ask the pipeline to treat the first `left` samples and last `right` samples to be ignored in
+                       decoding (but used at inference to provide more context to the model). In general, this additional
+                       stride argument is not required.
+            chunk_length_s (`float`, *optional*, defaults to 30.0):
+                The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled. By default, the chunk
+                length is set 30.0s, equal to Whisper's context window.
+            stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
+                The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
+                the model to *see* more context and infer letters better than without this context but the pipeline
+                discards the stride bits at the end to make the final reconstitution as perfect as possible.
+                <Tip>
+                For more information on how to effectively use `stride_length_s`, refer to the [ASR chunking
+                blog post](https://huggingface.co/blog/asr-chunking).
+                </Tip>
+            batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
+                The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
+                a batch size in the `__call__` method will supersede any batch size passed to the `__init__`.
+            task (`str`, *optional*):
+                Task to use for generation, either `"transcribe"` or `"translate"`. Defaults to `"transcribe"`.
+            language (`str`, *optional*):
+                Language token to use for generation, can be either in the form of `"<|en|>"`, `"en"` or `"english"`.
+                Defaults to `None`, meaning the language is automatically inferred from the audio input.
+            return_timestamps (*optional*, `bool`):
+                Whether to return timestamps in the prediction. Defaults to False. If set to true, the pipeline
+                will return two keys in the output dictionary: `"text"` containing the text transcription, and `"chunks"`
+                containing the transcription segments chunked by their utterance-level timestamps.
+            length_penalty (*optional*, `float`):
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an
+                exponent to the sequence length, which in turn is used to divide the score of the sequence. Since
+                the score is the log likelihood of the sequence (i.e. negative), length_penalty > 1.0 promotes
+                longer sequences, while length_penalty < 1.0 encourages shorter sequences.
+            do_sample (*optional*, `bool`):
+                Whether or not to use sampling ; use greedy decoding otherwise.
+            top_k (*optional*, `int`):
+                The number of the highest probability vocabulary tokens to keep for top-k-filtering.
+            temperature (*optional*, `float`):
+                The value used to modulate the next token probabilities if sampling.
+        Return:
+            `Dict`: A dictionary with the following keys:
+                - **text** (`str` ) -- The recognised text.
+                - **chunks** (*optional(, `List[Dict]`)
+                    When using `return_timestamps`, the `chunks` will become a list containing all the various text
+                    chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamps": (0.5,0.9), {"text":
+                    "there", "timestamps": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
+                    `"".join(chunk["text"] for chunk in output["chunks"])`.
+        """
+        batch_size = batch_size if batch_size is not None else self.batch_size
+        if batch_size % self.min_batch_size != 0:
+            raise ValueError(
+                f"Batch size must be a multiple of the number of JAX devices, but got batch size {batch_size} and num devices {self.min_batch_size}."
+            )
+        dataloader = self.preprocess_batch(
+            inputs, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, batch_size=batch_size
+        )
+        model_outputs = []
+        # iterate over our chunked audio samples
+        for batch in dataloader:
+            model_outputs.append(
+                self.forward(
+                    batch,
+                    batch_size=batch_size,
+                    language=language,
+                    task=task,
+                    return_timestamps=return_timestamps,
+                    num_beams=num_beams,
+                    length_penalty=length_penalty,
+                    do_sample=do_sample,
+                    top_k=top_k,
+                    temperature=temperature,
+                )
+            )
+        post_processed = self.postprocess(model_outputs, return_timestamps=return_timestamps)
+        return post_processed

distil_whisper/train_state.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from typing import Any, Mapping, MutableMapping, Optional, Tuple
+import flax.core
+import flax.serialization
+import flax.struct
+import jax.numpy as jnp
+from flax import traverse_util
+from flax.core import scope as flax_scope
+from flax.linen import partitioning as flax_partitioning
+# Shared frozen empty mapping; used below as the default for `InferenceState.flax_mutables`.
+EMPTY_DICT = flax.core.freeze({})
+# Short aliases for the dictionary types exported by `flax.core.scope`.
+FrozenDict = flax_scope.FrozenDict
+FrozenVariableDict = flax_scope.FrozenVariableDict
+MutableVariableDict = flax_scope.MutableVariableDict
+VariableDict = flax_scope.VariableDict
+def _validate_params_axes(params_axes, params):
+    axis_names = flax_partitioning.get_axis_names(params_axes)
+    missing_params_axes = set(traverse_util.flatten_dict(params, sep="/")) - set(
+        traverse_util.flatten_dict(axis_names, sep="/")
+    )
+    if missing_params_axes:
+        raise ValueError(f"Missing axis names for parameters: {missing_params_axes}")
+def _split_variables_and_axes(
+    variables_and_axes: FrozenVariableDict,
+) -> Tuple[FrozenVariableDict, FrozenVariableDict]:
+    """Splits `variables_and_axes` into two separate dicts with the same keys."""
+    # For each `key`, `key_axes` (if any) are its axes in `variables_and_axes`.
+    variables = {}
+    axes = {}
+    for k, v in variables_and_axes.items():
+        if k.endswith("_axes"):
+            axes[k[:-5]] = v  # k without "_axes".
+            _validate_params_axes(v, variables_and_axes[k[:-5]])  # k without "_axes".
+        else:
+            variables[k] = v
+    return flax.core.freeze(variables), flax.core.freeze(axes)
+class InferenceState(flax.struct.PyTreeNode):
+    """State compatible with FlaxOptimTrainState without optimizer state."""
+    step: jnp.ndarray
+    params: flax_scope.FrozenVariableDict
+    params_axes: Optional[flax_scope.FrozenVariableDict] = None
+    flax_mutables: flax_scope.FrozenDict = EMPTY_DICT
+    flax_mutables_axes: Optional[flax_scope.FrozenVariableDict] = None
+    @classmethod
+    def create(cls, model_variables: FrozenVariableDict) -> "InferenceState":
+        other_variables, params = model_variables.pop("params")
+        if "params_axes" in other_variables:
+            other_variables, params_axes = other_variables.pop("params_axes")
+            _validate_params_axes(params_axes, params)
+        else:
+            params_axes = None
+        # Split other_variables into mutables and their corresponding axes.
+        flax_mutables, flax_mutables_axes = _split_variables_and_axes(other_variables)
+        flax_mutables_axes = flax_mutables_axes or None
+        return InferenceState(
+            step=jnp.array(0),
+            params=params,
+            params_axes=params_axes,
+            flax_mutables=flax_mutables,
+            flax_mutables_axes=flax_mutables_axes,
+        )
+    @property
+    def param_states(self) -> FrozenVariableDict:
+        """The optimizer states of the parameters as a PyTree."""
+        raise NotImplementedError("InferenceState has no optimizer states.")
+    def apply_gradient(self, *args, **kwargs) -> "InferenceState":
+        raise NotImplementedError("InferenceState does not support `apply_gradient`.")
+    def state_dict(self) -> MutableMapping[str, Any]:
+        state_dict = {
+            "target": flax.core.unfreeze(self.params),
+            "state": {"step": self.step},
+        }
+        if self.flax_mutables:
+            state_dict["flax_mutables"] = flax.core.unfreeze(self.flax_mutables)
+        return state_dict
+    def replace_step(self, step: jnp.ndarray) -> "InferenceState":
+        return self.replace(step=step)
+    def replace_params(self, params: FrozenVariableDict) -> "InferenceState":
+        return self.replace(params=params)
+    def replace_flax_mutables(self, flax_mutables: FrozenDict) -> "InferenceState":
+        return self.replace(flax_mutables=flax_mutables)
+    def restore_state(self, state_dict: Mapping[str, Any]) -> "InferenceState":
+        return self.replace(
+            params=flax.core.freeze(state_dict["target"]),
+            step=state_dict["state"]["step"],
+            flax_mutables=(
+                flax.core.freeze(state_dict["flax_mutables"]) if "flax_mutables" in state_dict else EMPTY_DICT
+            ),
+        )
+    def as_logical_axes(self) -> "InferenceState":
+        # Set step to None so that when the logical axes are processed by the
+        # flax.partitioning.logical_to_mesh_axes function, it will be skipped
+        # because jax.tree_map will short circut and never call the function on the
+        # step.
+        flax_mutables_axes = self.flax_mutables_axes or EMPTY_DICT
+        return InferenceState(
+            step=None,
+            params=flax_partitioning.get_axis_names(self.params_axes),
+            flax_mutables=flax_partitioning.get_axis_names(flax_mutables_axes),
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "decoder_start_token_id": 50258,
+  "eos_token_id": 50257,
+  "is_multilingual": true,
+  "lang_to_id": {
+    "<|af|>": 50327,
+    "<|am|>": 50334,
+    "<|ar|>": 50272,
+    "<|as|>": 50350,
+    "<|az|>": 50304,
+    "<|ba|>": 50355,
+    "<|be|>": 50330,
+    "<|bg|>": 50292,
+    "<|bn|>": 50302,
+    "<|bo|>": 50347,
+    "<|br|>": 50309,
+    "<|bs|>": 50315,
+    "<|ca|>": 50270,
+    "<|cs|>": 50283,
+    "<|cy|>": 50297,
+    "<|da|>": 50285,
+    "<|de|>": 50261,
+    "<|el|>": 50281,
+    "<|en|>": 50259,
+    "<|es|>": 50262,
+    "<|et|>": 50307,
+    "<|eu|>": 50310,
+    "<|fa|>": 50300,
+    "<|fi|>": 50277,
+    "<|fo|>": 50338,
+    "<|fr|>": 50265,
+    "<|gl|>": 50319,
+    "<|gu|>": 50333,
+    "<|haw|>": 50352,
+    "<|ha|>": 50354,
+    "<|he|>": 50279,
+    "<|hi|>": 50276,
+    "<|hr|>": 50291,
+    "<|ht|>": 50339,
+    "<|hu|>": 50286,
+    "<|hy|>": 50312,
+    "<|id|>": 50275,
+    "<|is|>": 50311,
+    "<|it|>": 50274,
+    "<|ja|>": 50266,
+    "<|jw|>": 50356,
+    "<|ka|>": 50329,
+    "<|kk|>": 50316,
+    "<|km|>": 50323,
+    "<|kn|>": 50306,
+    "<|ko|>": 50264,
+    "<|la|>": 50294,
+    "<|lb|>": 50345,
+    "<|ln|>": 50353,
+    "<|lo|>": 50336,
+    "<|lt|>": 50293,
+    "<|lv|>": 50301,
+    "<|mg|>": 50349,
+    "<|mi|>": 50295,
+    "<|mk|>": 50308,
+    "<|ml|>": 50296,
+    "<|mn|>": 50314,
+    "<|mr|>": 50320,
+    "<|ms|>": 50282,
+    "<|mt|>": 50343,
+    "<|my|>": 50346,
+    "<|ne|>": 50313,
+    "<|nl|>": 50271,
+    "<|nn|>": 50342,
+    "<|no|>": 50288,
+    "<|oc|>": 50328,
+    "<|pa|>": 50321,
+    "<|pl|>": 50269,
+    "<|ps|>": 50340,
+    "<|pt|>": 50267,
+    "<|ro|>": 50284,
+    "<|ru|>": 50263,
+    "<|sa|>": 50344,
+    "<|sd|>": 50332,
+    "<|si|>": 50322,
+    "<|sk|>": 50298,
+    "<|sl|>": 50305,
+    "<|sn|>": 50324,
+    "<|so|>": 50326,
+    "<|sq|>": 50317,
+    "<|sr|>": 50303,
+    "<|su|>": 50357,
+    "<|sv|>": 50273,
+    "<|sw|>": 50318,
+    "<|ta|>": 50287,
+    "<|te|>": 50299,
+    "<|tg|>": 50331,
+    "<|th|>": 50289,
+    "<|tk|>": 50341,
+    "<|tl|>": 50348,
+    "<|tr|>": 50268,
+    "<|tt|>": 50351,
+    "<|uk|>": 50280,
+    "<|ur|>": 50290,
+    "<|uz|>": 50337,
+    "<|vi|>": 50278,
+    "<|yi|>": 50335,
+    "<|yo|>": 50325,
+    "<|yue|>": 50358,
+    "<|zh|>": 50260
+  },
+  "language": "no",
+  "max_initial_timestamp_index": 1,
+  "max_length": 448,
+  "no_timestamps_token_id": 50364,
+  "pad_token_id": 50257,
+  "return_timestamps": false,
+  "suppress_tokens": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "task": "transcribe",
+  "task_to_id": {
+    "transcribe": 50360,
+    "translate": 50359
+  },
+  "transformers_version": "4.46.2"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dceef1c98c82eee48a3a948d1ca88682b48946a66a0d989d6be8c1c49205bed
+size 3025686376

nb-distil-large-init/added_tokens.json ADDED Viewed

	@@ -0,0 +1,1611 @@

+{
+  "<|0.00|>": 50365,
+  "<|0.02|>": 50366,
+  "<|0.04|>": 50367,
+  "<|0.06|>": 50368,
+  "<|0.08|>": 50369,
+  "<|0.10|>": 50370,
+  "<|0.12|>": 50371,
+  "<|0.14|>": 50372,
+  "<|0.16|>": 50373,
+  "<|0.18|>": 50374,
+  "<|0.20|>": 50375,
+  "<|0.22|>": 50376,
+  "<|0.24|>": 50377,
+  "<|0.26|>": 50378,
+  "<|0.28|>": 50379,
+  "<|0.30|>": 50380,
+  "<|0.32|>": 50381,
+  "<|0.34|>": 50382,
+  "<|0.36|>": 50383,
+  "<|0.38|>": 50384,
+  "<|0.40|>": 50385,
+  "<|0.42|>": 50386,
+  "<|0.44|>": 50387,
+  "<|0.46|>": 50388,
+  "<|0.48|>": 50389,
+  "<|0.50|>": 50390,
+  "<|0.52|>": 50391,
+  "<|0.54|>": 50392,
+  "<|0.56|>": 50393,
+  "<|0.58|>": 50394,
+  "<|0.60|>": 50395,
+  "<|0.62|>": 50396,
+  "<|0.64|>": 50397,
+  "<|0.66|>": 50398,
+  "<|0.68|>": 50399,
+  "<|0.70|>": 50400,
+  "<|0.72|>": 50401,
+  "<|0.74|>": 50402,
+  "<|0.76|>": 50403,
+  "<|0.78|>": 50404,
+  "<|0.80|>": 50405,
+  "<|0.82|>": 50406,
+  "<|0.84|>": 50407,
+  "<|0.86|>": 50408,
+  "<|0.88|>": 50409,
+  "<|0.90|>": 50410,
+  "<|0.92|>": 50411,
+  "<|0.94|>": 50412,
+  "<|0.96|>": 50413,
+  "<|0.98|>": 50414,
+  "<|1.00|>": 50415,
+  "<|1.02|>": 50416,
+  "<|1.04|>": 50417,
+  "<|1.06|>": 50418,
+  "<|1.08|>": 50419,
+  "<|1.10|>": 50420,
+  "<|1.12|>": 50421,
+  "<|1.14|>": 50422,
+  "<|1.16|>": 50423,
+  "<|1.18|>": 50424,
+  "<|1.20|>": 50425,
+  "<|1.22|>": 50426,
+  "<|1.24|>": 50427,
+  "<|1.26|>": 50428,
+  "<|1.28|>": 50429,
+  "<|1.30|>": 50430,
+  "<|1.32|>": 50431,
+  "<|1.34|>": 50432,
+  "<|1.36|>": 50433,
+  "<|1.38|>": 50434,
+  "<|1.40|>": 50435,
+  "<|1.42|>": 50436,
+  "<|1.44|>": 50437,
+  "<|1.46|>": 50438,
+  "<|1.48|>": 50439,
+  "<|1.50|>": 50440,
+  "<|1.52|>": 50441,
+  "<|1.54|>": 50442,
+  "<|1.56|>": 50443,
+  "<|1.58|>": 50444,
+  "<|1.60|>": 50445,
+  "<|1.62|>": 50446,
+  "<|1.64|>": 50447,
+  "<|1.66|>": 50448,
+  "<|1.68|>": 50449,
+  "<|1.70|>": 50450,
+  "<|1.72|>": 50451,
+  "<|1.74|>": 50452,
+  "<|1.76|>": 50453,
+  "<|1.78|>": 50454,
+  "<|1.80|>": 50455,
+  "<|1.82|>": 50456,
+  "<|1.84|>": 50457,
+  "<|1.86|>": 50458,
+  "<|1.88|>": 50459,
+  "<|1.90|>": 50460,
+  "<|1.92|>": 50461,
+  "<|1.94|>": 50462,
+  "<|1.96|>": 50463,
+  "<|1.98|>": 50464,
+  "<|10.00|>": 50865,
+  "<|10.02|>": 50866,
+  "<|10.04|>": 50867,
+  "<|10.06|>": 50868,
+  "<|10.08|>": 50869,
+  "<|10.10|>": 50870,
+  "<|10.12|>": 50871,
+  "<|10.14|>": 50872,
+  "<|10.16|>": 50873,
+  "<|10.18|>": 50874,
+  "<|10.20|>": 50875,
+  "<|10.22|>": 50876,
+  "<|10.24|>": 50877,
+  "<|10.26|>": 50878,
+  "<|10.28|>": 50879,
+  "<|10.30|>": 50880,
+  "<|10.32|>": 50881,
+  "<|10.34|>": 50882,
+  "<|10.36|>": 50883,
+  "<|10.38|>": 50884,
+  "<|10.40|>": 50885,
+  "<|10.42|>": 50886,
+  "<|10.44|>": 50887,
+  "<|10.46|>": 50888,
+  "<|10.48|>": 50889,
+  "<|10.50|>": 50890,
+  "<|10.52|>": 50891,
+  "<|10.54|>": 50892,
+  "<|10.56|>": 50893,
+  "<|10.58|>": 50894,
+  "<|10.60|>": 50895,
+  "<|10.62|>": 50896,
+  "<|10.64|>": 50897,
+  "<|10.66|>": 50898,
+  "<|10.68|>": 50899,
+  "<|10.70|>": 50900,
+  "<|10.72|>": 50901,
+  "<|10.74|>": 50902,
+  "<|10.76|>": 50903,
+  "<|10.78|>": 50904,
+  "<|10.80|>": 50905,
+  "<|10.82|>": 50906,
+  "<|10.84|>": 50907,
+  "<|10.86|>": 50908,
+  "<|10.88|>": 50909,
+  "<|10.90|>": 50910,
+  "<|10.92|>": 50911,
+  "<|10.94|>": 50912,
+  "<|10.96|>": 50913,
+  "<|10.98|>": 50914,
+  "<|11.00|>": 50915,
+  "<|11.02|>": 50916,
+  "<|11.04|>": 50917,
+  "<|11.06|>": 50918,
+  "<|11.08|>": 50919,
+  "<|11.10|>": 50920,
+  "<|11.12|>": 50921,
+  "<|11.14|>": 50922,
+  "<|11.16|>": 50923,
+  "<|11.18|>": 50924,
+  "<|11.20|>": 50925,
+  "<|11.22|>": 50926,
+  "<|11.24|>": 50927,
+  "<|11.26|>": 50928,
+  "<|11.28|>": 50929,
+  "<|11.30|>": 50930,
+  "<|11.32|>": 50931,
+  "<|11.34|>": 50932,
+  "<|11.36|>": 50933,
+  "<|11.38|>": 50934,
+  "<|11.40|>": 50935,
+  "<|11.42|>": 50936,
+  "<|11.44|>": 50937,
+  "<|11.46|>": 50938,
+  "<|11.48|>": 50939,
+  "<|11.50|>": 50940,
+  "<|11.52|>": 50941,
+  "<|11.54|>": 50942,
+  "<|11.56|>": 50943,
+  "<|11.58|>": 50944,
+  "<|11.60|>": 50945,
+  "<|11.62|>": 50946,
+  "<|11.64|>": 50947,
+  "<|11.66|>": 50948,
+  "<|11.68|>": 50949,
+  "<|11.70|>": 50950,
+  "<|11.72|>": 50951,
+  "<|11.74|>": 50952,
+  "<|11.76|>": 50953,
+  "<|11.78|>": 50954,
+  "<|11.80|>": 50955,
+  "<|11.82|>": 50956,
+  "<|11.84|>": 50957,
+  "<|11.86|>": 50958,
+  "<|11.88|>": 50959,
+  "<|11.90|>": 50960,
+  "<|11.92|>": 50961,
+  "<|11.94|>": 50962,
+  "<|11.96|>": 50963,
+  "<|11.98|>": 50964,
+  "<|12.00|>": 50965,
+  "<|12.02|>": 50966,
+  "<|12.04|>": 50967,
+  "<|12.06|>": 50968,
+  "<|12.08|>": 50969,
+  "<|12.10|>": 50970,
+  "<|12.12|>": 50971,
+  "<|12.14|>": 50972,
+  "<|12.16|>": 50973,
+  "<|12.18|>": 50974,
+  "<|12.20|>": 50975,
+  "<|12.22|>": 50976,
+  "<|12.24|>": 50977,
+  "<|12.26|>": 50978,
+  "<|12.28|>": 50979,
+  "<|12.30|>": 50980,
+  "<|12.32|>": 50981,
+  "<|12.34|>": 50982,
+  "<|12.36|>": 50983,
+  "<|12.38|>": 50984,
+  "<|12.40|>": 50985,
+  "<|12.42|>": 50986,
+  "<|12.44|>": 50987,
+  "<|12.46|>": 50988,
+  "<|12.48|>": 50989,
+  "<|12.50|>": 50990,
+  "<|12.52|>": 50991,
+  "<|12.54|>": 50992,
+  "<|12.56|>": 50993,
+  "<|12.58|>": 50994,
+  "<|12.60|>": 50995,
+  "<|12.62|>": 50996,
+  "<|12.64|>": 50997,
+  "<|12.66|>": 50998,
+  "<|12.68|>": 50999,
+  "<|12.70|>": 51000,
+  "<|12.72|>": 51001,
+  "<|12.74|>": 51002,
+  "<|12.76|>": 51003,
+  "<|12.78|>": 51004,
+  "<|12.80|>": 51005,
+  "<|12.82|>": 51006,
+  "<|12.84|>": 51007,
+  "<|12.86|>": 51008,
+  "<|12.88|>": 51009,
+  "<|12.90|>": 51010,
+  "<|12.92|>": 51011,
+  "<|12.94|>": 51012,
+  "<|12.96|>": 51013,
+  "<|12.98|>": 51014,
+  "<|13.00|>": 51015,
+  "<|13.02|>": 51016,
+  "<|13.04|>": 51017,
+  "<|13.06|>": 51018,
+  "<|13.08|>": 51019,
+  "<|13.10|>": 51020,
+  "<|13.12|>": 51021,
+  "<|13.14|>": 51022,
+  "<|13.16|>": 51023,
+  "<|13.18|>": 51024,
+  "<|13.20|>": 51025,
+  "<|13.22|>": 51026,
+  "<|13.24|>": 51027,
+  "<|13.26|>": 51028,
+  "<|13.28|>": 51029,
+  "<|13.30|>": 51030,
+  "<|13.32|>": 51031,
+  "<|13.34|>": 51032,
+  "<|13.36|>": 51033,
+  "<|13.38|>": 51034,
+  "<|13.40|>": 51035,
+  "<|13.42|>": 51036,
+  "<|13.44|>": 51037,
+  "<|13.46|>": 51038,
+  "<|13.48|>": 51039,
+  "<|13.50|>": 51040,
+  "<|13.52|>": 51041,
+  "<|13.54|>": 51042,
+  "<|13.56|>": 51043,
+  "<|13.58|>": 51044,
+  "<|13.60|>": 51045,
+  "<|13.62|>": 51046,
+  "<|13.64|>": 51047,
+  "<|13.66|>": 51048,
+  "<|13.68|>": 51049,
+  "<|13.70|>": 51050,
+  "<|13.72|>": 51051,
+  "<|13.74|>": 51052,
+  "<|13.76|>": 51053,
+  "<|13.78|>": 51054,
+  "<|13.80|>": 51055,
+  "<|13.82|>": 51056,
+  "<|13.84|>": 51057,
+  "<|13.86|>": 51058,
+  "<|13.88|>": 51059,
+  "<|13.90|>": 51060,
+  "<|13.92|>": 51061,
+  "<|13.94|>": 51062,
+  "<|13.96|>": 51063,
+  "<|13.98|>": 51064,
+  "<|14.00|>": 51065,
+  "<|14.02|>": 51066,
+  "<|14.04|>": 51067,
+  "<|14.06|>": 51068,
+  "<|14.08|>": 51069,
+  "<|14.10|>": 51070,
+  "<|14.12|>": 51071,
+  "<|14.14|>": 51072,
+  "<|14.16|>": 51073,
+  "<|14.18|>": 51074,
+  "<|14.20|>": 51075,
+  "<|14.22|>": 51076,
+  "<|14.24|>": 51077,
+  "<|14.26|>": 51078,
+  "<|14.28|>": 51079,
+  "<|14.30|>": 51080,
+  "<|14.32|>": 51081,
+  "<|14.34|>": 51082,
+  "<|14.36|>": 51083,
+  "<|14.38|>": 51084,
+  "<|14.40|>": 51085,
+  "<|14.42|>": 51086,
+  "<|14.44|>": 51087,
+  "<|14.46|>": 51088,
+  "<|14.48|>": 51089,
+  "<|14.50|>": 51090,
+  "<|14.52|>": 51091,
+  "<|14.54|>": 51092,
+  "<|14.56|>": 51093,
+  "<|14.58|>": 51094,
+  "<|14.60|>": 51095,
+  "<|14.62|>": 51096,
+  "<|14.64|>": 51097,
+  "<|14.66|>": 51098,
+  "<|14.68|>": 51099,
+  "<|14.70|>": 51100,
+  "<|14.72|>": 51101,
+  "<|14.74|>": 51102,
+  "<|14.76|>": 51103,
+  "<|14.78|>": 51104,
+  "<|14.80|>": 51105,
+  "<|14.82|>": 51106,
+  "<|14.84|>": 51107,
+  "<|14.86|>": 51108,
+  "<|14.88|>": 51109,
+  "<|14.90|>": 51110,
+  "<|14.92|>": 51111,
+  "<|14.94|>": 51112,
+  "<|14.96|>": 51113,
+  "<|14.98|>": 51114,
+  "<|15.00|>": 51115,
+  "<|15.02|>": 51116,
+  "<|15.04|>": 51117,
+  "<|15.06|>": 51118,
+  "<|15.08|>": 51119,
+  "<|15.10|>": 51120,
+  "<|15.12|>": 51121,
+  "<|15.14|>": 51122,
+  "<|15.16|>": 51123,
+  "<|15.18|>": 51124,
+  "<|15.20|>": 51125,
+  "<|15.22|>": 51126,
+  "<|15.24|>": 51127,
+  "<|15.26|>": 51128,
+  "<|15.28|>": 51129,
+  "<|15.30|>": 51130,
+  "<|15.32|>": 51131,
+  "<|15.34|>": 51132,
+  "<|15.36|>": 51133,
+  "<|15.38|>": 51134,
+  "<|15.40|>": 51135,
+  "<|15.42|>": 51136,
+  "<|15.44|>": 51137,
+  "<|15.46|>": 51138,
+  "<|15.48|>": 51139,
+  "<|15.50|>": 51140,
+  "<|15.52|>": 51141,
+  "<|15.54|>": 51142,
+  "<|15.56|>": 51143,
+  "<|15.58|>": 51144,
+  "<|15.60|>": 51145,
+  "<|15.62|>": 51146,
+  "<|15.64|>": 51147,
+  "<|15.66|>": 51148,
+  "<|15.68|>": 51149,
+  "<|15.70|>": 51150,
+  "<|15.72|>": 51151,
+  "<|15.74|>": 51152,
+  "<|15.76|>": 51153,
+  "<|15.78|>": 51154,
+  "<|15.80|>": 51155,
+  "<|15.82|>": 51156,
+  "<|15.84|>": 51157,
+  "<|15.86|>": 51158,
+  "<|15.88|>": 51159,
+  "<|15.90|>": 51160,
+  "<|15.92|>": 51161,
+  "<|15.94|>": 51162,
+  "<|15.96|>": 51163,
+  "<|15.98|>": 51164,
+  "<|16.00|>": 51165,
+  "<|16.02|>": 51166,
+  "<|16.04|>": 51167,
+  "<|16.06|>": 51168,
+  "<|16.08|>": 51169,
+  "<|16.10|>": 51170,
+  "<|16.12|>": 51171,
+  "<|16.14|>": 51172,
+  "<|16.16|>": 51173,
+  "<|16.18|>": 51174,
+  "<|16.20|>": 51175,
+  "<|16.22|>": 51176,
+  "<|16.24|>": 51177,
+  "<|16.26|>": 51178,
+  "<|16.28|>": 51179,
+  "<|16.30|>": 51180,
+  "<|16.32|>": 51181,
+  "<|16.34|>": 51182,
+  "<|16.36|>": 51183,
+  "<|16.38|>": 51184,
+  "<|16.40|>": 51185,
+  "<|16.42|>": 51186,
+  "<|16.44|>": 51187,
+  "<|16.46|>": 51188,
+  "<|16.48|>": 51189,
+  "<|16.50|>": 51190,
+  "<|16.52|>": 51191,
+  "<|16.54|>": 51192,
+  "<|16.56|>": 51193,
+  "<|16.58|>": 51194,
+  "<|16.60|>": 51195,
+  "<|16.62|>": 51196,
+  "<|16.64|>": 51197,
+  "<|16.66|>": 51198,
+  "<|16.68|>": 51199,
+  "<|16.70|>": 51200,
+  "<|16.72|>": 51201,
+  "<|16.74|>": 51202,
+  "<|16.76|>": 51203,
+  "<|16.78|>": 51204,
+  "<|16.80|>": 51205,
+  "<|16.82|>": 51206,
+  "<|16.84|>": 51207,
+  "<|16.86|>": 51208,
+  "<|16.88|>": 51209,
+  "<|16.90|>": 51210,
+  "<|16.92|>": 51211,
+  "<|16.94|>": 51212,
+  "<|16.96|>": 51213,
+  "<|16.98|>": 51214,
+  "<|17.00|>": 51215,
+  "<|17.02|>": 51216,
+  "<|17.04|>": 51217,
+  "<|17.06|>": 51218,
+  "<|17.08|>": 51219,
+  "<|17.10|>": 51220,
+  "<|17.12|>": 51221,
+  "<|17.14|>": 51222,
+  "<|17.16|>": 51223,
+  "<|17.18|>": 51224,
+  "<|17.20|>": 51225,
+  "<|17.22|>": 51226,
+  "<|17.24|>": 51227,
+  "<|17.26|>": 51228,
+  "<|17.28|>": 51229,
+  "<|17.30|>": 51230,
+  "<|17.32|>": 51231,
+  "<|17.34|>": 51232,
+  "<|17.36|>": 51233,
+  "<|17.38|>": 51234,
+  "<|17.40|>": 51235,
+  "<|17.42|>": 51236,
+  "<|17.44|>": 51237,
+  "<|17.46|>": 51238,
+  "<|17.48|>": 51239,
+  "<|17.50|>": 51240,
+  "<|17.52|>": 51241,
+  "<|17.54|>": 51242,
+  "<|17.56|>": 51243,
+  "<|17.58|>": 51244,
+  "<|17.60|>": 51245,
+  "<|17.62|>": 51246,
+  "<|17.64|>": 51247,
+  "<|17.66|>": 51248,
+  "<|17.68|>": 51249,
+  "<|17.70|>": 51250,
+  "<|17.72|>": 51251,
+  "<|17.74|>": 51252,
+  "<|17.76|>": 51253,
+  "<|17.78|>": 51254,
+  "<|17.80|>": 51255,
+  "<|17.82|>": 51256,
+  "<|17.84|>": 51257,
+  "<|17.86|>": 51258,
+  "<|17.88|>": 51259,
+  "<|17.90|>": 51260,
+  "<|17.92|>": 51261,
+  "<|17.94|>": 51262,
+  "<|17.96|>": 51263,
+  "<|17.98|>": 51264,
+  "<|18.00|>": 51265,
+  "<|18.02|>": 51266,
+  "<|18.04|>": 51267,
+  "<|18.06|>": 51268,
+  "<|18.08|>": 51269,
+  "<|18.10|>": 51270,
+  "<|18.12|>": 51271,
+  "<|18.14|>": 51272,
+  "<|18.16|>": 51273,
+  "<|18.18|>": 51274,
+  "<|18.20|>": 51275,
+  "<|18.22|>": 51276,
+  "<|18.24|>": 51277,
+  "<|18.26|>": 51278,
+  "<|18.28|>": 51279,
+  "<|18.30|>": 51280,
+  "<|18.32|>": 51281,
+  "<|18.34|>": 51282,
+  "<|18.36|>": 51283,
+  "<|18.38|>": 51284,
+  "<|18.40|>": 51285,
+  "<|18.42|>": 51286,
+  "<|18.44|>": 51287,
+  "<|18.46|>": 51288,
+  "<|18.48|>": 51289,
+  "<|18.50|>": 51290,
+  "<|18.52|>": 51291,
+  "<|18.54|>": 51292,
+  "<|18.56|>": 51293,
+  "<|18.58|>": 51294,
+  "<|18.60|>": 51295,
+  "<|18.62|>": 51296,
+  "<|18.64|>": 51297,
+  "<|18.66|>": 51298,
+  "<|18.68|>": 51299,
+  "<|18.70|>": 51300,
+  "<|18.72|>": 51301,
+  "<|18.74|>": 51302,
+  "<|18.76|>": 51303,
+  "<|18.78|>": 51304,
+  "<|18.80|>": 51305,
+  "<|18.82|>": 51306,
+  "<|18.84|>": 51307,
+  "<|18.86|>": 51308,
+  "<|18.88|>": 51309,
+  "<|18.90|>": 51310,
+  "<|18.92|>": 51311,
+  "<|18.94|>": 51312,
+  "<|18.96|>": 51313,
+  "<|18.98|>": 51314,
+  "<|19.00|>": 51315,
+  "<|19.02|>": 51316,
+  "<|19.04|>": 51317,
+  "<|19.06|>": 51318,
+  "<|19.08|>": 51319,
+  "<|19.10|>": 51320,
+  "<|19.12|>": 51321,
+  "<|19.14|>": 51322,
+  "<|19.16|>": 51323,
+  "<|19.18|>": 51324,
+  "<|19.20|>": 51325,
+  "<|19.22|>": 51326,
+  "<|19.24|>": 51327,
+  "<|19.26|>": 51328,
+  "<|19.28|>": 51329,
+  "<|19.30|>": 51330,
+  "<|19.32|>": 51331,
+  "<|19.34|>": 51332,
+  "<|19.36|>": 51333,
+  "<|19.38|>": 51334,
+  "<|19.40|>": 51335,
+  "<|19.42|>": 51336,
+  "<|19.44|>": 51337,
+  "<|19.46|>": 51338,
+  "<|19.48|>": 51339,
+  "<|19.50|>": 51340,
+  "<|19.52|>": 51341,
+  "<|19.54|>": 51342,
+  "<|19.56|>": 51343,
+  "<|19.58|>": 51344,
+  "<|19.60|>": 51345,
+  "<|19.62|>": 51346,
+  "<|19.64|>": 51347,
+  "<|19.66|>": 51348,
+  "<|19.68|>": 51349,
+  "<|19.70|>": 51350,
+  "<|19.72|>": 51351,
+  "<|19.74|>": 51352,
+  "<|19.76|>": 51353,
+  "<|19.78|>": 51354,
+  "<|19.80|>": 51355,
+  "<|19.82|>": 51356,
+  "<|19.84|>": 51357,
+  "<|19.86|>": 51358,
+  "<|19.88|>": 51359,
+  "<|19.90|>": 51360,
+  "<|19.92|>": 51361,
+  "<|19.94|>": 51362,
+  "<|19.96|>": 51363,
+  "<|19.98|>": 51364,
+  "<|2.00|>": 50465,
+  "<|2.02|>": 50466,
+  "<|2.04|>": 50467,
+  "<|2.06|>": 50468,
+  "<|2.08|>": 50469,
+  "<|2.10|>": 50470,
+  "<|2.12|>": 50471,
+  "<|2.14|>": 50472,
+  "<|2.16|>": 50473,
+  "<|2.18|>": 50474,
+  "<|2.20|>": 50475,
+  "<|2.22|>": 50476,
+  "<|2.24|>": 50477,
+  "<|2.26|>": 50478,
+  "<|2.28|>": 50479,
+  "<|2.30|>": 50480,
+  "<|2.32|>": 50481,
+  "<|2.34|>": 50482,
+  "<|2.36|>": 50483,
+  "<|2.38|>": 50484,
+  "<|2.40|>": 50485,
+  "<|2.42|>": 50486,
+  "<|2.44|>": 50487,
+  "<|2.46|>": 50488,
+  "<|2.48|>": 50489,
+  "<|2.50|>": 50490,
+  "<|2.52|>": 50491,
+  "<|2.54|>": 50492,
+  "<|2.56|>": 50493,
+  "<|2.58|>": 50494,
+  "<|2.60|>": 50495,
+  "<|2.62|>": 50496,
+  "<|2.64|>": 50497,
+  "<|2.66|>": 50498,
+  "<|2.68|>": 50499,
+  "<|2.70|>": 50500,
+  "<|2.72|>": 50501,
+  "<|2.74|>": 50502,
+  "<|2.76|>": 50503,
+  "<|2.78|>": 50504,
+  "<|2.80|>": 50505,
+  "<|2.82|>": 50506,
+  "<|2.84|>": 50507,
+  "<|2.86|>": 50508,
+  "<|2.88|>": 50509,
+  "<|2.90|>": 50510,
+  "<|2.92|>": 50511,
+  "<|2.94|>": 50512,
+  "<|2.96|>": 50513,
+  "<|2.98|>": 50514,
+  "<|20.00|>": 51365,
+  "<|20.02|>": 51366,
+  "<|20.04|>": 51367,
+  "<|20.06|>": 51368,
+  "<|20.08|>": 51369,
+  "<|20.10|>": 51370,
+  "<|20.12|>": 51371,
+  "<|20.14|>": 51372,
+  "<|20.16|>": 51373,
+  "<|20.18|>": 51374,
+  "<|20.20|>": 51375,
+  "<|20.22|>": 51376,
+  "<|20.24|>": 51377,
+  "<|20.26|>": 51378,
+  "<|20.28|>": 51379,
+  "<|20.30|>": 51380,
+  "<|20.32|>": 51381,
+  "<|20.34|>": 51382,
+  "<|20.36|>": 51383,
+  "<|20.38|>": 51384,
+  "<|20.40|>": 51385,
+  "<|20.42|>": 51386,
+  "<|20.44|>": 51387,
+  "<|20.46|>": 51388,
+  "<|20.48|>": 51389,
+  "<|20.50|>": 51390,
+  "<|20.52|>": 51391,
+  "<|20.54|>": 51392,
+  "<|20.56|>": 51393,
+  "<|20.58|>": 51394,
+  "<|20.60|>": 51395,
+  "<|20.62|>": 51396,
+  "<|20.64|>": 51397,
+  "<|20.66|>": 51398,
+  "<|20.68|>": 51399,
+  "<|20.70|>": 51400,
+  "<|20.72|>": 51401,
+  "<|20.74|>": 51402,
+  "<|20.76|>": 51403,
+  "<|20.78|>": 51404,
+  "<|20.80|>": 51405,
+  "<|20.82|>": 51406,
+  "<|20.84|>": 51407,
+  "<|20.86|>": 51408,
+  "<|20.88|>": 51409,
+  "<|20.90|>": 51410,
+  "<|20.92|>": 51411,
+  "<|20.94|>": 51412,
+  "<|20.96|>": 51413,
+  "<|20.98|>": 51414,
+  "<|21.00|>": 51415,
+  "<|21.02|>": 51416,
+  "<|21.04|>": 51417,
+  "<|21.06|>": 51418,
+  "<|21.08|>": 51419,
+  "<|21.10|>": 51420,
+  "<|21.12|>": 51421,
+  "<|21.14|>": 51422,
+  "<|21.16|>": 51423,
+  "<|21.18|>": 51424,
+  "<|21.20|>": 51425,
+  "<|21.22|>": 51426,
+  "<|21.24|>": 51427,
+  "<|21.26|>": 51428,
+  "<|21.28|>": 51429,
+  "<|21.30|>": 51430,
+  "<|21.32|>": 51431,
+  "<|21.34|>": 51432,
+  "<|21.36|>": 51433,
+  "<|21.38|>": 51434,
+  "<|21.40|>": 51435,
+  "<|21.42|>": 51436,
+  "<|21.44|>": 51437,
+  "<|21.46|>": 51438,
+  "<|21.48|>": 51439,
+  "<|21.50|>": 51440,
+  "<|21.52|>": 51441,
+  "<|21.54|>": 51442,
+  "<|21.56|>": 51443,
+  "<|21.58|>": 51444,
+  "<|21.60|>": 51445,
+  "<|21.62|>": 51446,
+  "<|21.64|>": 51447,
+  "<|21.66|>": 51448,
+  "<|21.68|>": 51449,
+  "<|21.70|>": 51450,
+  "<|21.72|>": 51451,
+  "<|21.74|>": 51452,
+  "<|21.76|>": 51453,
+  "<|21.78|>": 51454,
+  "<|21.80|>": 51455,
+  "<|21.82|>": 51456,
+  "<|21.84|>": 51457,
+  "<|21.86|>": 51458,
+  "<|21.88|>": 51459,
+  "<|21.90|>": 51460,
+  "<|21.92|>": 51461,
+  "<|21.94|>": 51462,
+  "<|21.96|>": 51463,
+  "<|21.98|>": 51464,
+  "<|22.00|>": 51465,
+  "<|22.02|>": 51466,
+  "<|22.04|>": 51467,
+  "<|22.06|>": 51468,
+  "<|22.08|>": 51469,
+  "<|22.10|>": 51470,
+  "<|22.12|>": 51471,
+  "<|22.14|>": 51472,
+  "<|22.16|>": 51473,
+  "<|22.18|>": 51474,
+  "<|22.20|>": 51475,
+  "<|22.22|>": 51476,
+  "<|22.24|>": 51477,
+  "<|22.26|>": 51478,
+  "<|22.28|>": 51479,
+  "<|22.30|>": 51480,
+  "<|22.32|>": 51481,
+  "<|22.34|>": 51482,
+  "<|22.36|>": 51483,
+  "<|22.38|>": 51484,
+  "<|22.40|>": 51485,
+  "<|22.42|>": 51486,
+  "<|22.44|>": 51487,
+  "<|22.46|>": 51488,
+  "<|22.48|>": 51489,
+  "<|22.50|>": 51490,
+  "<|22.52|>": 51491,
+  "<|22.54|>": 51492,
+  "<|22.56|>": 51493,
+  "<|22.58|>": 51494,
+  "<|22.60|>": 51495,
+  "<|22.62|>": 51496,
+  "<|22.64|>": 51497,
+  "<|22.66|>": 51498,
+  "<|22.68|>": 51499,
+  "<|22.70|>": 51500,
+  "<|22.72|>": 51501,
+  "<|22.74|>": 51502,
+  "<|22.76|>": 51503,
+  "<|22.78|>": 51504,
+  "<|22.80|>": 51505,
+  "<|22.82|>": 51506,
+  "<|22.84|>": 51507,
+  "<|22.86|>": 51508,
+  "<|22.88|>": 51509,
+  "<|22.90|>": 51510,
+  "<|22.92|>": 51511,
+  "<|22.94|>": 51512,
+  "<|22.96|>": 51513,
+  "<|22.98|>": 51514,
+  "<|23.00|>": 51515,
+  "<|23.02|>": 51516,
+  "<|23.04|>": 51517,
+  "<|23.06|>": 51518,
+  "<|23.08|>": 51519,
+  "<|23.10|>": 51520,
+  "<|23.12|>": 51521,
+  "<|23.14|>": 51522,
+  "<|23.16|>": 51523,
+  "<|23.18|>": 51524,
+  "<|23.20|>": 51525,
+  "<|23.22|>": 51526,
+  "<|23.24|>": 51527,
+  "<|23.26|>": 51528,
+  "<|23.28|>": 51529,
+  "<|23.30|>": 51530,
+  "<|23.32|>": 51531,
+  "<|23.34|>": 51532,
+  "<|23.36|>": 51533,
+  "<|23.38|>": 51534,
+  "<|23.40|>": 51535,
+  "<|23.42|>": 51536,
+  "<|23.44|>": 51537,
+  "<|23.46|>": 51538,
+  "<|23.48|>": 51539,
+  "<|23.50|>": 51540,
+  "<|23.52|>": 51541,
+  "<|23.54|>": 51542,
+  "<|23.56|>": 51543,
+  "<|23.58|>": 51544,
+  "<|23.60|>": 51545,
+  "<|23.62|>": 51546,
+  "<|23.64|>": 51547,
+  "<|23.66|>": 51548,
+  "<|23.68|>": 51549,
+  "<|23.70|>": 51550,
+  "<|23.72|>": 51551,
+  "<|23.74|>": 51552,
+  "<|23.76|>": 51553,
+  "<|23.78|>": 51554,
+  "<|23.80|>": 51555,
+  "<|23.82|>": 51556,
+  "<|23.84|>": 51557,
+  "<|23.86|>": 51558,
+  "<|23.88|>": 51559,
+  "<|23.90|>": 51560,
+  "<|23.92|>": 51561,
+  "<|23.94|>": 51562,
+  "<|23.96|>": 51563,
+  "<|23.98|>": 51564,
+  "<|24.00|>": 51565,
+  "<|24.02|>": 51566,
+  "<|24.04|>": 51567,
+  "<|24.06|>": 51568,
+  "<|24.08|>": 51569,
+  "<|24.10|>": 51570,
+  "<|24.12|>": 51571,
+  "<|24.14|>": 51572,
+  "<|24.16|>": 51573,
+  "<|24.18|>": 51574,
+  "<|24.20|>": 51575,
+  "<|24.22|>": 51576,
+  "<|24.24|>": 51577,
+  "<|24.26|>": 51578,
+  "<|24.28|>": 51579,
+  "<|24.30|>": 51580,
+  "<|24.32|>": 51581,
+  "<|24.34|>": 51582,
+  "<|24.36|>": 51583,
+  "<|24.38|>": 51584,
+  "<|24.40|>": 51585,
+  "<|24.42|>": 51586,
+  "<|24.44|>": 51587,
+  "<|24.46|>": 51588,
+  "<|24.48|>": 51589,
+  "<|24.50|>": 51590,
+  "<|24.52|>": 51591,
+  "<|24.54|>": 51592,
+  "<|24.56|>": 51593,
+  "<|24.58|>": 51594,
+  "<|24.60|>": 51595,
+  "<|24.62|>": 51596,
+  "<|24.64|>": 51597,
+  "<|24.66|>": 51598,
+  "<|24.68|>": 51599,
+  "<|24.70|>": 51600,
+  "<|24.72|>": 51601,
+  "<|24.74|>": 51602,
+  "<|24.76|>": 51603,
+  "<|24.78|>": 51604,
+  "<|24.80|>": 51605,
+  "<|24.82|>": 51606,
+  "<|24.84|>": 51607,
+  "<|24.86|>": 51608,
+  "<|24.88|>": 51609,
+  "<|24.90|>": 51610,
+  "<|24.92|>": 51611,
+  "<|24.94|>": 51612,
+  "<|24.96|>": 51613,
+  "<|24.98|>": 51614,
+  "<|25.00|>": 51615,
+  "<|25.02|>": 51616,
+  "<|25.04|>": 51617,
+  "<|25.06|>": 51618,
+  "<|25.08|>": 51619,
+  "<|25.10|>": 51620,
+  "<|25.12|>": 51621,
+  "<|25.14|>": 51622,
+  "<|25.16|>": 51623,
+  "<|25.18|>": 51624,
+  "<|25.20|>": 51625,
+  "<|25.22|>": 51626,
+  "<|25.24|>": 51627,
+  "<|25.26|>": 51628,
+  "<|25.28|>": 51629,
+  "<|25.30|>": 51630,
+  "<|25.32|>": 51631,
+  "<|25.34|>": 51632,
+  "<|25.36|>": 51633,
+  "<|25.38|>": 51634,
+  "<|25.40|>": 51635,
+  "<|25.42|>": 51636,
+  "<|25.44|>": 51637,
+  "<|25.46|>": 51638,
+  "<|25.48|>": 51639,
+  "<|25.50|>": 51640,
+  "<|25.52|>": 51641,
+  "<|25.54|>": 51642,
+  "<|25.56|>": 51643,
+  "<|25.58|>": 51644,
+  "<|25.60|>": 51645,
+  "<|25.62|>": 51646,
+  "<|25.64|>": 51647,
+  "<|25.66|>": 51648,
+  "<|25.68|>": 51649,
+  "<|25.70|>": 51650,
+  "<|25.72|>": 51651,
+  "<|25.74|>": 51652,
+  "<|25.76|>": 51653,
+  "<|25.78|>": 51654,
+  "<|25.80|>": 51655,
+  "<|25.82|>": 51656,
+  "<|25.84|>": 51657,
+  "<|25.86|>": 51658,
+  "<|25.88|>": 51659,
+  "<|25.90|>": 51660,
+  "<|25.92|>": 51661,
+  "<|25.94|>": 51662,
+  "<|25.96|>": 51663,
+  "<|25.98|>": 51664,
+  "<|26.00|>": 51665,
+  "<|26.02|>": 51666,
+  "<|26.04|>": 51667,
+  "<|26.06|>": 51668,
+  "<|26.08|>": 51669,
+  "<|26.10|>": 51670,
+  "<|26.12|>": 51671,
+  "<|26.14|>": 51672,
+  "<|26.16|>": 51673,
+  "<|26.18|>": 51674,
+  "<|26.20|>": 51675,
+  "<|26.22|>": 51676,
+  "<|26.24|>": 51677,
+  "<|26.26|>": 51678,
+  "<|26.28|>": 51679,
+  "<|26.30|>": 51680,
+  "<|26.32|>": 51681,
+  "<|26.34|>": 51682,
+  "<|26.36|>": 51683,
+  "<|26.38|>": 51684,
+  "<|26.40|>": 51685,
+  "<|26.42|>": 51686,
+  "<|26.44|>": 51687,
+  "<|26.46|>": 51688,
+  "<|26.48|>": 51689,
+  "<|26.50|>": 51690,
+  "<|26.52|>": 51691,
+  "<|26.54|>": 51692,
+  "<|26.56|>": 51693,
+  "<|26.58|>": 51694,
+  "<|26.60|>": 51695,
+  "<|26.62|>": 51696,
+  "<|26.64|>": 51697,
+  "<|26.66|>": 51698,
+  "<|26.68|>": 51699,
+  "<|26.70|>": 51700,
+  "<|26.72|>": 51701,
+  "<|26.74|>": 51702,
+  "<|26.76|>": 51703,
+  "<|26.78|>": 51704,
+  "<|26.80|>": 51705,
+  "<|26.82|>": 51706,
+  "<|26.84|>": 51707,
+  "<|26.86|>": 51708,
+  "<|26.88|>": 51709,
+  "<|26.90|>": 51710,
+  "<|26.92|>": 51711,
+  "<|26.94|>": 51712,
+  "<|26.96|>": 51713,
+  "<|26.98|>": 51714,
+  "<|27.00|>": 51715,
+  "<|27.02|>": 51716,
+  "<|27.04|>": 51717,
+  "<|27.06|>": 51718,
+  "<|27.08|>": 51719,
+  "<|27.10|>": 51720,
+  "<|27.12|>": 51721,
+  "<|27.14|>": 51722,
+  "<|27.16|>": 51723,
+  "<|27.18|>": 51724,
+  "<|27.20|>": 51725,
+  "<|27.22|>": 51726,
+  "<|27.24|>": 51727,
+  "<|27.26|>": 51728,
+  "<|27.28|>": 51729,
+  "<|27.30|>": 51730,
+  "<|27.32|>": 51731,
+  "<|27.34|>": 51732,
+  "<|27.36|>": 51733,
+  "<|27.38|>": 51734,
+  "<|27.40|>": 51735,
+  "<|27.42|>": 51736,
+  "<|27.44|>": 51737,
+  "<|27.46|>": 51738,
+  "<|27.48|>": 51739,
+  "<|27.50|>": 51740,
+  "<|27.52|>": 51741,
+  "<|27.54|>": 51742,
+  "<|27.56|>": 51743,
+  "<|27.58|>": 51744,
+  "<|27.60|>": 51745,
+  "<|27.62|>": 51746,
+  "<|27.64|>": 51747,
+  "<|27.66|>": 51748,
+  "<|27.68|>": 51749,
+  "<|27.70|>": 51750,
+  "<|27.72|>": 51751,
+  "<|27.74|>": 51752,
+  "<|27.76|>": 51753,
+  "<|27.78|>": 51754,
+  "<|27.80|>": 51755,
+  "<|27.82|>": 51756,
+  "<|27.84|>": 51757,
+  "<|27.86|>": 51758,
+  "<|27.88|>": 51759,
+  "<|27.90|>": 51760,
+  "<|27.92|>": 51761,
+  "<|27.94|>": 51762,
+  "<|27.96|>": 51763,
+  "<|27.98|>": 51764,
+  "<|28.00|>": 51765,
+  "<|28.02|>": 51766,
+  "<|28.04|>": 51767,
+  "<|28.06|>": 51768,
+  "<|28.08|>": 51769,
+  "<|28.10|>": 51770,
+  "<|28.12|>": 51771,
+  "<|28.14|>": 51772,
+  "<|28.16|>": 51773,
+  "<|28.18|>": 51774,
+  "<|28.20|>": 51775,
+  "<|28.22|>": 51776,
+  "<|28.24|>": 51777,
+  "<|28.26|>": 51778,
+  "<|28.28|>": 51779,
+  "<|28.30|>": 51780,
+  "<|28.32|>": 51781,
+  "<|28.34|>": 51782,
+  "<|28.36|>": 51783,
+  "<|28.38|>": 51784,
+  "<|28.40|>": 51785,
+  "<|28.42|>": 51786,
+  "<|28.44|>": 51787,
+  "<|28.46|>": 51788,
+  "<|28.48|>": 51789,
+  "<|28.50|>": 51790,
+  "<|28.52|>": 51791,
+  "<|28.54|>": 51792,
+  "<|28.56|>": 51793,
+  "<|28.58|>": 51794,
+  "<|28.60|>": 51795,
+  "<|28.62|>": 51796,
+  "<|28.64|>": 51797,
+  "<|28.66|>": 51798,
+  "<|28.68|>": 51799,
+  "<|28.70|>": 51800,
+  "<|28.72|>": 51801,
+  "<|28.74|>": 51802,
+  "<|28.76|>": 51803,
+  "<|28.78|>": 51804,
+  "<|28.80|>": 51805,
+  "<|28.82|>": 51806,
+  "<|28.84|>": 51807,
+  "<|28.86|>": 51808,
+  "<|28.88|>": 51809,
+  "<|28.90|>": 51810,
+  "<|28.92|>": 51811,
+  "<|28.94|>": 51812,
+  "<|28.96|>": 51813,
+  "<|28.98|>": 51814,
+  "<|29.00|>": 51815,
+  "<|29.02|>": 51816,
+  "<|29.04|>": 51817,
+  "<|29.06|>": 51818,
+  "<|29.08|>": 51819,
+  "<|29.10|>": 51820,
+  "<|29.12|>": 51821,
+  "<|29.14|>": 51822,
+  "<|29.16|>": 51823,
+  "<|29.18|>": 51824,
+  "<|29.20|>": 51825,
+  "<|29.22|>": 51826,
+  "<|29.24|>": 51827,
+  "<|29.26|>": 51828,
+  "<|29.28|>": 51829,
+  "<|29.30|>": 51830,
+  "<|29.32|>": 51831,
+  "<|29.34|>": 51832,
+  "<|29.36|>": 51833,
+  "<|29.38|>": 51834,
+  "<|29.40|>": 51835,
+  "<|29.42|>": 51836,
+  "<|29.44|>": 51837,
+  "<|29.46|>": 51838,
+  "<|29.48|>": 51839,
+  "<|29.50|>": 51840,
+  "<|29.52|>": 51841,
+  "<|29.54|>": 51842,
+  "<|29.56|>": 51843,
+  "<|29.58|>": 51844,
+  "<|29.60|>": 51845,
+  "<|29.62|>": 51846,
+  "<|29.64|>": 51847,
+  "<|29.66|>": 51848,
+  "<|29.68|>": 51849,
+  "<|29.70|>": 51850,
+  "<|29.72|>": 51851,
+  "<|29.74|>": 51852,
+  "<|29.76|>": 51853,
+  "<|29.78|>": 51854,
+  "<|29.80|>": 51855,
+  "<|29.82|>": 51856,
+  "<|29.84|>": 51857,
+  "<|29.86|>": 51858,
+  "<|29.88|>": 51859,
+  "<|29.90|>": 51860,
+  "<|29.92|>": 51861,
+  "<|29.94|>": 51862,
+  "<|29.96|>": 51863,
+  "<|29.98|>": 51864,
+  "<|3.00|>": 50515,
+  "<|3.02|>": 50516,
+  "<|3.04|>": 50517,
+  "<|3.06|>": 50518,
+  "<|3.08|>": 50519,
+  "<|3.10|>": 50520,
+  "<|3.12|>": 50521,
+  "<|3.14|>": 50522,
+  "<|3.16|>": 50523,
+  "<|3.18|>": 50524,
+  "<|3.20|>": 50525,
+  "<|3.22|>": 50526,
+  "<|3.24|>": 50527,
+  "<|3.26|>": 50528,
+  "<|3.28|>": 50529,
+  "<|3.30|>": 50530,
+  "<|3.32|>": 50531,
+  "<|3.34|>": 50532,
+  "<|3.36|>": 50533,
+  "<|3.38|>": 50534,
+  "<|3.40|>": 50535,
+  "<|3.42|>": 50536,
+  "<|3.44|>": 50537,
+  "<|3.46|>": 50538,
+  "<|3.48|>": 50539,
+  "<|3.50|>": 50540,
+  "<|3.52|>": 50541,
+  "<|3.54|>": 50542,
+  "<|3.56|>": 50543,
+  "<|3.58|>": 50544,
+  "<|3.60|>": 50545,
+  "<|3.62|>": 50546,
+  "<|3.64|>": 50547,
+  "<|3.66|>": 50548,
+  "<|3.68|>": 50549,
+  "<|3.70|>": 50550,
+  "<|3.72|>": 50551,
+  "<|3.74|>": 50552,
+  "<|3.76|>": 50553,
+  "<|3.78|>": 50554,
+  "<|3.80|>": 50555,
+  "<|3.82|>": 50556,
+  "<|3.84|>": 50557,
+  "<|3.86|>": 50558,
+  "<|3.88|>": 50559,
+  "<|3.90|>": 50560,
+  "<|3.92|>": 50561,
+  "<|3.94|>": 50562,
+  "<|3.96|>": 50563,
+  "<|3.98|>": 50564,
+  "<|30.00|>": 51865,
+  "<|4.00|>": 50565,
+  "<|4.02|>": 50566,
+  "<|4.04|>": 50567,
+  "<|4.06|>": 50568,
+  "<|4.08|>": 50569,
+  "<|4.10|>": 50570,
+  "<|4.12|>": 50571,
+  "<|4.14|>": 50572,
+  "<|4.16|>": 50573,
+  "<|4.18|>": 50574,
+  "<|4.20|>": 50575,
+  "<|4.22|>": 50576,
+  "<|4.24|>": 50577,
+  "<|4.26|>": 50578,
+  "<|4.28|>": 50579,
+  "<|4.30|>": 50580,
+  "<|4.32|>": 50581,
+  "<|4.34|>": 50582,
+  "<|4.36|>": 50583,
+  "<|4.38|>": 50584,
+  "<|4.40|>": 50585,
+  "<|4.42|>": 50586,
+  "<|4.44|>": 50587,
+  "<|4.46|>": 50588,
+  "<|4.48|>": 50589,
+  "<|4.50|>": 50590,
+  "<|4.52|>": 50591,
+  "<|4.54|>": 50592,
+  "<|4.56|>": 50593,
+  "<|4.58|>": 50594,
+  "<|4.60|>": 50595,
+  "<|4.62|>": 50596,
+  "<|4.64|>": 50597,
+  "<|4.66|>": 50598,
+  "<|4.68|>": 50599,
+  "<|4.70|>": 50600,
+  "<|4.72|>": 50601,
+  "<|4.74|>": 50602,
+  "<|4.76|>": 50603,
+  "<|4.78|>": 50604,
+  "<|4.80|>": 50605,
+  "<|4.82|>": 50606,
+  "<|4.84|>": 50607,
+  "<|4.86|>": 50608,
+  "<|4.88|>": 50609,
+  "<|4.90|>": 50610,
+  "<|4.92|>": 50611,
+  "<|4.94|>": 50612,
+  "<|4.96|>": 50613,
+  "<|4.98|>": 50614,
+  "<|5.00|>": 50615,
+  "<|5.02|>": 50616,
+  "<|5.04|>": 50617,
+  "<|5.06|>": 50618,
+  "<|5.08|>": 50619,
+  "<|5.10|>": 50620,
+  "<|5.12|>": 50621,
+  "<|5.14|>": 50622,
+  "<|5.16|>": 50623,
+  "<|5.18|>": 50624,
+  "<|5.20|>": 50625,
+  "<|5.22|>": 50626,
+  "<|5.24|>": 50627,
+  "<|5.26|>": 50628,
+  "<|5.28|>": 50629,
+  "<|5.30|>": 50630,
+  "<|5.32|>": 50631,
+  "<|5.34|>": 50632,
+  "<|5.36|>": 50633,
+  "<|5.38|>": 50634,
+  "<|5.40|>": 50635,
+  "<|5.42|>": 50636,
+  "<|5.44|>": 50637,
+  "<|5.46|>": 50638,
+  "<|5.48|>": 50639,
+  "<|5.50|>": 50640,
+  "<|5.52|>": 50641,
+  "<|5.54|>": 50642,
+  "<|5.56|>": 50643,
+  "<|5.58|>": 50644,
+  "<|5.60|>": 50645,
+  "<|5.62|>": 50646,
+  "<|5.64|>": 50647,
+  "<|5.66|>": 50648,
+  "<|5.68|>": 50649,
+  "<|5.70|>": 50650,
+  "<|5.72|>": 50651,
+  "<|5.74|>": 50652,
+  "<|5.76|>": 50653,
+  "<|5.78|>": 50654,
+  "<|5.80|>": 50655,
+  "<|5.82|>": 50656,
+  "<|5.84|>": 50657,
+  "<|5.86|>": 50658,
+  "<|5.88|>": 50659,
+  "<|5.90|>": 50660,
+  "<|5.92|>": 50661,
+  "<|5.94|>": 50662,
+  "<|5.96|>": 50663,
+  "<|5.98|>": 50664,
+  "<|6.00|>": 50665,
+  "<|6.02|>": 50666,
+  "<|6.04|>": 50667,
+  "<|6.06|>": 50668,
+  "<|6.08|>": 50669,
+  "<|6.10|>": 50670,
+  "<|6.12|>": 50671,
+  "<|6.14|>": 50672,
+  "<|6.16|>": 50673,
+  "<|6.18|>": 50674,
+  "<|6.20|>": 50675,
+  "<|6.22|>": 50676,
+  "<|6.24|>": 50677,
+  "<|6.26|>": 50678,
+  "<|6.28|>": 50679,
+  "<|6.30|>": 50680,
+  "<|6.32|>": 50681,
+  "<|6.34|>": 50682,
+  "<|6.36|>": 50683,
+  "<|6.38|>": 50684,
+  "<|6.40|>": 50685,
+  "<|6.42|>": 50686,
+  "<|6.44|>": 50687,
+  "<|6.46|>": 50688,
+  "<|6.48|>": 50689,
+  "<|6.50|>": 50690,
+  "<|6.52|>": 50691,
+  "<|6.54|>": 50692,
+  "<|6.56|>": 50693,
+  "<|6.58|>": 50694,
+  "<|6.60|>": 50695,
+  "<|6.62|>": 50696,
+  "<|6.64|>": 50697,
+  "<|6.66|>": 50698,
+  "<|6.68|>": 50699,
+  "<|6.70|>": 50700,
+  "<|6.72|>": 50701,
+  "<|6.74|>": 50702,
+  "<|6.76|>": 50703,
+  "<|6.78|>": 50704,
+  "<|6.80|>": 50705,
+  "<|6.82|>": 50706,
+  "<|6.84|>": 50707,
+  "<|6.86|>": 50708,
+  "<|6.88|>": 50709,
+  "<|6.90|>": 50710,
+  "<|6.92|>": 50711,
+  "<|6.94|>": 50712,
+  "<|6.96|>": 50713,
+  "<|6.98|>": 50714,
+  "<|7.00|>": 50715,
+  "<|7.02|>": 50716,
+  "<|7.04|>": 50717,
+  "<|7.06|>": 50718,
+  "<|7.08|>": 50719,
+  "<|7.10|>": 50720,
+  "<|7.12|>": 50721,
+  "<|7.14|>": 50722,
+  "<|7.16|>": 50723,
+  "<|7.18|>": 50724,
+  "<|7.20|>": 50725,
+  "<|7.22|>": 50726,
+  "<|7.24|>": 50727,
+  "<|7.26|>": 50728,
+  "<|7.28|>": 50729,
+  "<|7.30|>": 50730,
+  "<|7.32|>": 50731,
+  "<|7.34|>": 50732,
+  "<|7.36|>": 50733,
+  "<|7.38|>": 50734,
+  "<|7.40|>": 50735,
+  "<|7.42|>": 50736,
+  "<|7.44|>": 50737,
+  "<|7.46|>": 50738,
+  "<|7.48|>": 50739,
+  "<|7.50|>": 50740,
+  "<|7.52|>": 50741,
+  "<|7.54|>": 50742,
+  "<|7.56|>": 50743,
+  "<|7.58|>": 50744,
+  "<|7.60|>": 50745,
+  "<|7.62|>": 50746,
+  "<|7.64|>": 50747,
+  "<|7.66|>": 50748,
+  "<|7.68|>": 50749,
+  "<|7.70|>": 50750,
+  "<|7.72|>": 50751,
+  "<|7.74|>": 50752,
+  "<|7.76|>": 50753,
+  "<|7.78|>": 50754,
+  "<|7.80|>": 50755,
+  "<|7.82|>": 50756,
+  "<|7.84|>": 50757,
+  "<|7.86|>": 50758,
+  "<|7.88|>": 50759,
+  "<|7.90|>": 50760,
+  "<|7.92|>": 50761,
+  "<|7.94|>": 50762,
+  "<|7.96|>": 50763,
+  "<|7.98|>": 50764,
+  "<|8.00|>": 50765,
+  "<|8.02|>": 50766,
+  "<|8.04|>": 50767,
+  "<|8.06|>": 50768,
+  "<|8.08|>": 50769,
+  "<|8.10|>": 50770,
+  "<|8.12|>": 50771,
+  "<|8.14|>": 50772,
+  "<|8.16|>": 50773,
+  "<|8.18|>": 50774,
+  "<|8.20|>": 50775,
+  "<|8.22|>": 50776,
+  "<|8.24|>": 50777,
+  "<|8.26|>": 50778,
+  "<|8.28|>": 50779,
+  "<|8.30|>": 50780,
+  "<|8.32|>": 50781,
+  "<|8.34|>": 50782,
+  "<|8.36|>": 50783,
+  "<|8.38|>": 50784,
+  "<|8.40|>": 50785,
+  "<|8.42|>": 50786,
+  "<|8.44|>": 50787,
+  "<|8.46|>": 50788,
+  "<|8.48|>": 50789,
+  "<|8.50|>": 50790,
+  "<|8.52|>": 50791,
+  "<|8.54|>": 50792,
+  "<|8.56|>": 50793,
+  "<|8.58|>": 50794,
+  "<|8.60|>": 50795,
+  "<|8.62|>": 50796,
+  "<|8.64|>": 50797,
+  "<|8.66|>": 50798,
+  "<|8.68|>": 50799,
+  "<|8.70|>": 50800,
+  "<|8.72|>": 50801,
+  "<|8.74|>": 50802,
+  "<|8.76|>": 50803,
+  "<|8.78|>": 50804,
+  "<|8.80|>": 50805,
+  "<|8.82|>": 50806,
+  "<|8.84|>": 50807,
+  "<|8.86|>": 50808,
+  "<|8.88|>": 50809,
+  "<|8.90|>": 50810,
+  "<|8.92|>": 50811,
+  "<|8.94|>": 50812,
+  "<|8.96|>": 50813,
+  "<|8.98|>": 50814,
+  "<|9.00|>": 50815,
+  "<|9.02|>": 50816,
+  "<|9.04|>": 50817,
+  "<|9.06|>": 50818,
+  "<|9.08|>": 50819,
+  "<|9.10|>": 50820,
+  "<|9.12|>": 50821,
+  "<|9.14|>": 50822,
+  "<|9.16|>": 50823,
+  "<|9.18|>": 50824,
+  "<|9.20|>": 50825,
+  "<|9.22|>": 50826,
+  "<|9.24|>": 50827,
+  "<|9.26|>": 50828,
+  "<|9.28|>": 50829,
+  "<|9.30|>": 50830,
+  "<|9.32|>": 50831,
+  "<|9.34|>": 50832,
+  "<|9.36|>": 50833,
+  "<|9.38|>": 50834,
+  "<|9.40|>": 50835,
+  "<|9.42|>": 50836,
+  "<|9.44|>": 50837,
+  "<|9.46|>": 50838,
+  "<|9.48|>": 50839,
+  "<|9.50|>": 50840,
+  "<|9.52|>": 50841,
+  "<|9.54|>": 50842,
+  "<|9.56|>": 50843,
+  "<|9.58|>": 50844,
+  "<|9.60|>": 50845,
+  "<|9.62|>": 50846,
+  "<|9.64|>": 50847,
+  "<|9.66|>": 50848,
+  "<|9.68|>": 50849,
+  "<|9.70|>": 50850,
+  "<|9.72|>": 50851,
+  "<|9.74|>": 50852,
+  "<|9.76|>": 50853,
+  "<|9.78|>": 50854,
+  "<|9.80|>": 50855,
+  "<|9.82|>": 50856,
+  "<|9.84|>": 50857,
+  "<|9.86|>": 50858,
+  "<|9.88|>": 50859,
+  "<|9.90|>": 50860,
+  "<|9.92|>": 50861,
+  "<|9.94|>": 50862,
+  "<|9.96|>": 50863,
+  "<|9.98|>": 50864,
+  "<|af|>": 50327,
+  "<|am|>": 50334,
+  "<|ar|>": 50272,
+  "<|as|>": 50350,
+  "<|az|>": 50304,
+  "<|ba|>": 50355,
+  "<|be|>": 50330,
+  "<|bg|>": 50292,
+  "<|bn|>": 50302,
+  "<|bo|>": 50347,
+  "<|br|>": 50309,
+  "<|bs|>": 50315,
+  "<|ca|>": 50270,
+  "<|cs|>": 50283,
+  "<|cy|>": 50297,
+  "<|da|>": 50285,
+  "<|de|>": 50261,
+  "<|el|>": 50281,
+  "<|endoftext|>": 50257,
+  "<|en|>": 50259,
+  "<|es|>": 50262,
+  "<|et|>": 50307,
+  "<|eu|>": 50310,
+  "<|fa|>": 50300,
+  "<|fi|>": 50277,
+  "<|fo|>": 50338,
+  "<|fr|>": 50265,
+  "<|gl|>": 50319,
+  "<|gu|>": 50333,
+  "<|haw|>": 50352,
+  "<|ha|>": 50354,
+  "<|he|>": 50279,
+  "<|hi|>": 50276,
+  "<|hr|>": 50291,
+  "<|ht|>": 50339,
+  "<|hu|>": 50286,
+  "<|hy|>": 50312,
+  "<|id|>": 50275,
+  "<|is|>": 50311,
+  "<|it|>": 50274,
+  "<|ja|>": 50266,
+  "<|jw|>": 50356,
+  "<|ka|>": 50329,
+  "<|kk|>": 50316,
+  "<|km|>": 50323,
+  "<|kn|>": 50306,
+  "<|ko|>": 50264,
+  "<|la|>": 50294,
+  "<|lb|>": 50345,
+  "<|ln|>": 50353,
+  "<|lo|>": 50336,
+  "<|lt|>": 50293,
+  "<|lv|>": 50301,
+  "<|mg|>": 50349,
+  "<|mi|>": 50295,
+  "<|mk|>": 50308,
+  "<|ml|>": 50296,
+  "<|mn|>": 50314,
+  "<|mr|>": 50320,
+  "<|ms|>": 50282,
+  "<|mt|>": 50343,
+  "<|my|>": 50346,
+  "<|ne|>": 50313,
+  "<|nl|>": 50271,
+  "<|nn|>": 50342,
+  "<|nospeech|>": 50363,
+  "<|notimestamps|>": 50364,
+  "<|no|>": 50288,
+  "<|oc|>": 50328,
+  "<|pa|>": 50321,
+  "<|pl|>": 50269,
+  "<|ps|>": 50340,
+  "<|pt|>": 50267,
+  "<|ro|>": 50284,
+  "<|ru|>": 50263,
+  "<|sa|>": 50344,
+  "<|sd|>": 50332,
+  "<|si|>": 50322,
+  "<|sk|>": 50298,
+  "<|sl|>": 50305,
+  "<|sn|>": 50324,
+  "<|so|>": 50326,
+  "<|sq|>": 50317,
+  "<|sr|>": 50303,
+  "<|startoflm|>": 50361,
+  "<|startofprev|>": 50362,
+  "<|startoftranscript|>": 50258,
+  "<|su|>": 50357,
+  "<|sv|>": 50273,
+  "<|sw|>": 50318,
+  "<|ta|>": 50287,
+  "<|te|>": 50299,
+  "<|tg|>": 50331,
+  "<|th|>": 50289,
+  "<|tk|>": 50341,
+  "<|tl|>": 50348,
+  "<|transcribe|>": 50360,
+  "<|translate|>": 50359,
+  "<|tr|>": 50268,
+  "<|tt|>": 50351,
+  "<|uk|>": 50280,
+  "<|ur|>": 50290,
+  "<|uz|>": 50337,
+  "<|vi|>": 50278,
+  "<|yi|>": 50335,
+  "<|yo|>": 50325,
+  "<|yue|>": 50358,
+  "<|zh|>": 50260
+}

nb-distil-large-init/config.json ADDED Viewed

	@@ -0,0 +1,285 @@

+{
+  "_name_or_path": "NbAiLab/nb-whisper-large",
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "apply_spec_augment": false,
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0,
+  "begin_suppress_tokens": null,
+  "bos_token_id": 50257,
+  "classifier_proj_size": 256,
+  "d_model": 1280,
+  "decoder_attention_heads": 20,
+  "decoder_ffn_dim": 5120,
+  "decoder_layerdrop": 0,
+  "decoder_layers": 2,
+  "decoder_start_token_id": 50258,
+  "dropout": 0,
+  "encoder_attention_heads": 20,
+  "encoder_ffn_dim": 5120,
+  "encoder_layerdrop": 0,
+  "encoder_layers": 32,
+  "eos_token_id": 50257,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "lang_ids": [
+    50259,
+    50260,
+    50261,
+    50262,
+    50263,
+    50264,
+    50265,
+    50266,
+    50267,
+    50268,
+    50269,
+    50270,
+    50271,
+    50272,
+    50273,
+    50274,
+    50275,
+    50276,
+    50277,
+    50278,
+    50279,
+    50280,
+    50281,
+    50282,
+    50283,
+    50284,
+    50285,
+    50286,
+    50287,
+    50288,
+    50289,
+    50290,
+    50291,
+    50292,
+    50293,
+    50294,
+    50295,
+    50296,
+    50297,
+    50298,
+    50299,
+    50300,
+    50301,
+    50302,
+    50303,
+    50304,
+    50305,
+    50306,
+    50307,
+    50308,
+    50309,
+    50310,
+    50311,
+    50312,
+    50313,
+    50314,
+    50315,
+    50316,
+    50317,
+    50318,
+    50319,
+    50320,
+    50321,
+    50322,
+    50323,
+    50324,
+    50325,
+    50326,
+    50327,
+    50328,
+    50329,
+    50330,
+    50331,
+    50332,
+    50333,
+    50334,
+    50335,
+    50336,
+    50337,
+    50338,
+    50339,
+    50340,
+    50341,
+    50342,
+    50343,
+    50344,
+    50345,
+    50346,
+    50347,
+    50348,
+    50349,
+    50350,
+    50351,
+    50352,
+    50353,
+    50354,
+    50355,
+    50356,
+    50357,
+    50358
+  ],
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": null,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "median_filter_width": 7,
+  "model_type": "whisper",
+  "num_hidden_layers": 32,
+  "num_mel_bins": 128,
+  "pad_token_id": 50256,
+  "scale_embedding": false,
+  "suppress_ids": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "suppress_ids_begin": [
+    220,
+    50257
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 51866
+}

nb-distil-large-init/generation_config.json ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+  "alignment_heads": [
+    [
+      7,
+      0
+    ],
+    [
+      10,
+      17
+    ],
+    [
+      12,
+      18
+    ],
+    [
+      13,
+      12
+    ],
+    [
+      16,
+      1
+    ],
+    [
+      17,
+      14
+    ],
+    [
+      19,
+      11
+    ],
+    [
+      21,
+      4
+    ],
+    [
+      24,
+      1
+    ],
+    [
+      25,
+      6
+    ]
+  ],
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "decoder_start_token_id": 50258,
+  "eos_token_id": 50257,
+  "is_multilingual": true,
+  "lang_to_id": {
+    "<|af|>": 50327,
+    "<|am|>": 50334,
+    "<|ar|>": 50272,
+    "<|as|>": 50350,
+    "<|az|>": 50304,
+    "<|ba|>": 50355,
+    "<|be|>": 50330,
+    "<|bg|>": 50292,
+    "<|bn|>": 50302,
+    "<|bo|>": 50347,
+    "<|br|>": 50309,
+    "<|bs|>": 50315,
+    "<|ca|>": 50270,
+    "<|cs|>": 50283,
+    "<|cy|>": 50297,
+    "<|da|>": 50285,
+    "<|de|>": 50261,
+    "<|el|>": 50281,
+    "<|en|>": 50259,
+    "<|es|>": 50262,
+    "<|et|>": 50307,
+    "<|eu|>": 50310,
+    "<|fa|>": 50300,
+    "<|fi|>": 50277,
+    "<|fo|>": 50338,
+    "<|fr|>": 50265,
+    "<|gl|>": 50319,
+    "<|gu|>": 50333,
+    "<|haw|>": 50352,
+    "<|ha|>": 50354,
+    "<|he|>": 50279,
+    "<|hi|>": 50276,
+    "<|hr|>": 50291,
+    "<|ht|>": 50339,
+    "<|hu|>": 50286,
+    "<|hy|>": 50312,
+    "<|id|>": 50275,
+    "<|is|>": 50311,
+    "<|it|>": 50274,
+    "<|ja|>": 50266,
+    "<|jw|>": 50356,
+    "<|ka|>": 50329,
+    "<|kk|>": 50316,
+    "<|km|>": 50323,
+    "<|kn|>": 50306,
+    "<|ko|>": 50264,
+    "<|la|>": 50294,
+    "<|lb|>": 50345,
+    "<|ln|>": 50353,
+    "<|lo|>": 50336,
+    "<|lt|>": 50293,
+    "<|lv|>": 50301,
+    "<|mg|>": 50349,
+    "<|mi|>": 50295,
+    "<|mk|>": 50308,
+    "<|ml|>": 50296,
+    "<|mn|>": 50314,
+    "<|mr|>": 50320,
+    "<|ms|>": 50282,
+    "<|mt|>": 50343,
+    "<|my|>": 50346,
+    "<|ne|>": 50313,
+    "<|nl|>": 50271,
+    "<|nn|>": 50342,
+    "<|no|>": 50288,
+    "<|oc|>": 50328,
+    "<|pa|>": 50321,
+    "<|pl|>": 50269,
+    "<|ps|>": 50340,
+    "<|pt|>": 50267,
+    "<|ro|>": 50284,
+    "<|ru|>": 50263,
+    "<|sa|>": 50344,
+    "<|sd|>": 50332,
+    "<|si|>": 50322,
+    "<|sk|>": 50298,
+    "<|sl|>": 50305,
+    "<|sn|>": 50324,
+    "<|so|>": 50326,
+    "<|sq|>": 50317,
+    "<|sr|>": 50303,
+    "<|su|>": 50357,
+    "<|sv|>": 50273,
+    "<|sw|>": 50318,
+    "<|ta|>": 50287,
+    "<|te|>": 50299,
+    "<|tg|>": 50331,
+    "<|th|>": 50289,
+    "<|tk|>": 50341,
+    "<|tl|>": 50348,
+    "<|tr|>": 50268,
+    "<|tt|>": 50351,
+    "<|uk|>": 50280,
+    "<|ur|>": 50290,
+    "<|uz|>": 50337,
+    "<|vi|>": 50278,
+    "<|yi|>": 50335,
+    "<|yo|>": 50325,
+    "<|yue|>": 50358,
+    "<|zh|>": 50260
+  },
+  "language": "<|no|>",
+  "max_initial_timestamp_index": 1,
+  "max_length": 448,
+  "no_timestamps_token_id": 50364,
+  "pad_token_id": 50257,
+  "return_timestamps": false,
+  "suppress_tokens": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50359,
+    50360,
+    50361,
+    50362,
+    50363
+  ],
+  "task": "transcribe",
+  "task_to_id": {
+    "transcribe": 50360,
+    "translate": 50359
+  },
+  "transformers_version": "4.46.2"
+}

nb-distil-large-init/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

nb-distil-large-init/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5c6cce6fdc832cc805fe6ea8af8db5527ba4b5d0c3381ba404c82e3e8161db6
+size 3025686376

nb-distil-large-init/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "chunk_length": 30,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

nb-distil-large-init/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+  "additional_special_tokens": [
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|yue|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nospeech|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

nb-distil-large-init/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nb-distil-large-init/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "chunk_length": 30,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

run_distillation.py ADDED Viewed

	@@ -0,0 +1,1827 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Training the Whisper model for sequence to sequence speech recognition via teacher-student distillation.
+"""
+# You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
+import logging
+import os
+import re
+import shutil
+import sys
+import time
+from dataclasses import dataclass, field
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+import datasets
+import evaluate
+import numpy as np
+import torch
+import torch.nn as nn
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from datasets import (
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    concatenate_datasets,
+    interleave_datasets,
+    load_dataset,
+)
+from huggingface_hub import create_repo, get_full_repo_name, upload_folder
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AddedToken,
+    HfArgumentParser,
+    Seq2SeqTrainingArguments,
+    WhisperConfig,
+    WhisperFeatureExtractor,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    WhisperTokenizerFast,
+    get_scheduler
+)
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
+check_min_version("4.34.0.dev0")
+require_version("datasets>=2.14.6", "To fix: `pip install --upgrade datasets`")
+logger = get_logger(__name__)
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to distill from.
+    """
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained Whisper model or model identifier from huggingface.co/models"}
+    )
+    teacher_model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained teacher model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained config name or path if not the same as model_name"},
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "feature extractor name or path if not the same as model_name"},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    subfolder: str = field(
+        default="",
+        metadata={
+            "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can "
+            "specify the folder name here."
+        },
+    )
+    token: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    attn_implementation: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Which attention implementation to use in the encoder and decoder attention layers. Can be one of:\n"
+                "1. `eager` or `None`: default Transformers attention implementation.\n"
+                "2. `sdpa`: Flash Attention through PyTorch SDPA. Requires `torch>=2.1`. Recommended for hardware where Flash Attention 2 is not supported, e.g. Turing GPUs, (T4, RTX 2080).\n"
+                "3. `flash_attention_2`: Flash Attention 2 through the Flash Attention package https://github.com/Dao-AILab/flash-attention. **Always** recommended on supported hardware (Ampere, Ada, or Hopper GPUs, e.g., A100, RTX 3090, RTX 4090, H100)."
+            )
+        },
+    )
+    def __post_init__(self):
+        if self.attn_implementation not in [None, "eager", "sdpa", "flash_attention_2"]:
+            raise ValueError(
+                f"Got `--attn_implementation={self.attn_implementation}`, which is an invalid attention type. Should be one of:\n"
+                "1. `eager` or `None`: default Transformers attention implementation.\n"
+                "2. `sdpa`: Flash Attention through PyTorch SDPA. Requires `torch>=2.1`. Recommended for hardware where Flash Attention 2 is not supported, e.g. Turing GPUs, (T4, RTX 2080).\n"
+                "3. `flash_attention_2`: Flash Attention 2 through the Flash Attention package https://github.com/Dao-AILab/flash-attention. **Always** recommended on supported hardware (Ampere, Ada, or Hopper GPUs, e.g., A100, RTX 3090, RTX 4090, H100)."
+            )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    Every field carries its command-line help text in `metadata["help"]`. Fields whose default is `None`
+    are annotated as `Optional[...]` so the annotation agrees with the default value.
+    """
+    # --- dataset selection ---
+    train_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load LibriSpeech "
+            "and Common Voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    train_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset configs by a '+' symbol. Note that the order of the configs should "
+            "match the order of the datasets."
+        },
+    )
+    train_dataset_samples: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Number of samples in each dataset when loading multiple datasets with streaming mode. "
+            "Not required when using one dataset or non-streaming mode. The sample values provide the sampling "
+            "probability for each dataset. Setting them equal to the number of sample values ensures that every "
+            "sample from every dataset is used once per epoch."
+        },
+    )
+    eval_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training "
+            "dataset name if unspecified. Load multiple evaluation datasets by separating dataset "
+            "ids by a '+' symbol."
+        },
+    )
+    eval_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the "
+            "training dataset config name if unspecified."
+        },
+    )
+    # --- caching and preprocessing ---
+    dataset_cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to cache directory for saving and loading datasets"},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing if using non-streaming mode."},
+    )
+    preprocessing_batch_size: Optional[int] = field(
+        default=256,
+        metadata={"help": "Number of examples per batch provided to the `prepare_dataset` function."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set."
+            )
+        },
+    )
+    # --- column names ---
+    audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+    )
+    text_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the text data in the training set."},
+    )
+    eval_text_column_name: str = field(
+        default="text",
+        metadata={"help": ("The name of the dataset column containing the text data in the evaluation set.")},
+    )
+    # --- length limits and padding ---
+    max_duration_in_seconds: float = field(
+        default=30.0,
+        metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0,
+        metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"},
+    )
+    max_label_length: int = field(
+        default=448,
+        metadata={"help": "Truncate transcriptions that are longer than `max_label_length` tokens."},
+    )
+    pad_target_to_multiple_of: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set will pad the target sequence to a multiple of the provided"
+                " value. This is important to avoid triggering recompilations on TPU."
+                " If unspecified, will default to padding the targets to max length."
+            )
+        },
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to only do data preprocessing and skip training. This is"
+                " especially useful when data preprocessing errors out in distributed"
+                " training due to timeout. In this case, one should run the"
+                " preprocessing in a non-distributed setup with"
+                " `preprocessing_only=True` so that the cached datasets can"
+                " consequently be loaded in distributed training"
+            )
+        },
+    )
+    # --- splits and loading mode ---
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    eval_split_name: str = field(
+        default="validation",
+        metadata={
+            "help": (
+                "The name of the evaluation data set split to use (via the datasets library). Defaults to 'validation'"
+            )
+        },
+    )
+    streaming: bool = field(
+        default=True,
+        metadata={"help": "Whether to use Datasets' streaming mode to load and pre-process the data."},
+    )
+    # --- pseudo-labelling and decoding behaviour ---
+    wer_threshold: Optional[float] = field(
+        default=None,
+        metadata={
+            # Each fragment ends with a space: adjacent literals are concatenated verbatim.
+            "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
+            "WER with the normalised transcriptions. This only takes effect if training on pseudo-labels targets. "
+            "If `--use_pseudo_labels=False`, then no WER filtering is performed, since we train directly on the text "
+            "transcriptions."
+        },
+    )
+    use_pseudo_labels: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether or not to use pseudo-label transcriptions as the targets. If True, the pseudo-labels "
+            "must be in the dataset column `whisper_transcript` from the previous pseudo-labelling step. This is "
+            "not currently yet configurable."
+        },
+    )
+    timestamp_probability: float = field(
+        default=0.2, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
+    )
+    condition_on_prev_probability: float = field(
+        default=0.2, metadata={"help": "Probability for conditioning on the previous text example."}
+    )
+    return_timestamps: bool = field(
+        default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
+    )
+    language: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Language for multilingual distillation. This argument should be set for multilingual distillation "
+                "only. For English speech recognition, it should be left as `None`."
+            )
+        },
+    )
+    task: str = field(
+        default="transcribe",
+        metadata={
+            "help": "Task, either `transcribe` for speech recognition or `translate` for speech translation. "
+            "This argument should be set for multilingual distillation only. For English speech recognition, it should be left as `None`."
+        },
+    )
+    # --- experiment tracking ---
+    wandb_project: str = field(
+        default="distil-whisper",
+        metadata={"help": "The name of the wandb project."},
+    )
+    wandb_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the wandb run."},
+    )
+    wandb_dir: str = field(
+        default="./wandb",
+        metadata={"help": "The dir where wandb metadata will be stored."},
+    )
+@dataclass
+class DistillationTrainingArguments(Seq2SeqTrainingArguments):
+    """
+    Training arguments specific to distillation, layered on top of `Seq2SeqTrainingArguments`.
+    Adds switches for freezing parts of the model, the distillation temperature and loss weighting, the
+    training dtype, and the number of best checkpoints to retain.
+    """
+    # Module-freezing switches.
+    freeze_encoder: Optional[bool] = field(
+        default=False,
+        metadata=dict(
+            help="Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
+            "copied from the teacher model."
+        ),
+    )
+    freeze_decoder: Optional[bool] = field(
+        default=False,
+        metadata=dict(
+            help="Whether to freeze the entire decoder model. Note that the decoder input embeddings are **not** frozen, since they are tied to the LM head."
+        ),
+    )
+    freeze_embed_positions: Optional[bool] = field(
+        default=False,
+        metadata=dict(help="Whether to freeze the decoder embedding positions."),
+    )
+    # Distillation loss settings.
+    temperature: Optional[float] = field(
+        default=2.0,
+        metadata=dict(help="Temperature to anneal the logits when computing the softmax."),
+    )
+    # NOTE(review): the field name suggests a KL-divergence term while the help text describes an MSE loss;
+    # confirm against the training loop before relying on either description.
+    kl_weight: Optional[float] = field(
+        default=1.0,
+        metadata=dict(
+            help="Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
+            "computed between the teacher-student hidden states and attentions."
+        ),
+    )
+    # Numerical precision used for training.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata=dict(
+            help="The data type (dtype) in which to run training. One of `float32` (full-precision), "
+            "`float16` or `bfloat16` (both half-precision)."
+        ),
+    )
+    # Retention limit for "best" checkpoints.
+    save_best_total_limit: Optional[int] = field(
+        default=1,
+        metadata=dict(help="Number of best models to be saved."),
+    )
+@dataclass
+class DataCollatorSpeechSeq2SeqWithPadding:
+    """
+    Collate a list of examples into one padded batch for sequence-to-sequence speech training.
+    Args:
+        processor (`Any`):
+            Object exposing `feature_extractor.pad(...)` for the audio features and `tokenizer.pad(...)` for the
+            label token ids.
+        decoder_start_token_id (`int`):
+            The start-of-sequence token id of the decoder. Target positions up to and including this token are
+            excluded from the loss when the token is found after the first target position.
+        decoder_prev_token_id (`int`):
+            The start-of-prompt token id of the decoder. Stored on the collator; `__call__` does not read it.
+        input_padding (`bool` or `str`, defaults to `"max_length"`):
+            Padding strategy forwarded as `padding=` to `processor.feature_extractor.pad`.
+        target_padding (`bool` or `str`, defaults to `"max_length"`):
+            Padding strategy forwarded as `padding=` to `processor.tokenizer.pad`.
+        max_target_length (`int`, *optional*):
+            Forwarded as `max_length=` to `processor.tokenizer.pad`.
+    """
+    processor: Any
+    decoder_start_token_id: int
+    decoder_prev_token_id: int
+    input_padding: Union[bool, str] = "max_length"
+    target_padding: Union[bool, str] = "max_length"
+    max_target_length: Optional[int] = None
+    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
+        """
+        Pad audio features and labels separately, then derive `decoder_input_ids` and loss-masked `labels`.
+        Each element of `features` must provide the keys `"input_features"` and `"labels"`. The returned batch is
+        whatever `feature_extractor.pad` produced, extended with `"labels"` and `"decoder_input_ids"`.
+        """
+        # Audio and text are padded by different components, so gather them into separate dicts first.
+        audio_inputs = {"input_features": [example["input_features"] for example in features]}
+        token_inputs = {"input_ids": [example["labels"] for example in features]}
+        batch = self.processor.feature_extractor.pad(
+            audio_inputs,
+            padding=self.input_padding,
+            return_tensors="pt",
+        )
+        padded_tokens = self.processor.tokenizer.pad(
+            token_inputs,
+            max_length=self.max_target_length,
+            padding=self.target_padding,
+            return_tensors="pt",
+        )
+        token_ids = padded_tokens["input_ids"]
+        # Teacher forcing: the decoder sees tokens [0, T-1) and is trained to predict tokens [1, T).
+        decoder_input_ids = token_ids[:, :-1]
+        targets = token_ids[:, 1:]
+        target_attention = padded_tokens.attention_mask[:, 1:]
+        # Padded positions get -100 so the loss ignores them.
+        targets = targets.masked_fill(target_attention.ne(1), -100)
+        # Locate the decoder start token in each row (argmax yields 0 when it is absent). A position > 0 means
+        # prompt tokens precede it, so everything up to and including that token is excluded from the loss.
+        start_position = torch.argmax((targets == self.decoder_start_token_id).long(), dim=1)
+        ignored_prefix = torch.where(start_position > 0, start_position + 1, start_position)
+        column_index = torch.arange(targets.shape[1])
+        inside_prompt = column_index < ignored_prefix[:, None]
+        targets = targets.masked_fill(inside_prompt, -100)
+        batch["labels"] = targets
+        batch["decoder_input_ids"] = decoder_input_ids
+        return batch
+def log_metric(
+    accelerator,
+    metrics: Dict,
+    train_time: float,
+    step: int,
+    epoch: int,
+    learning_rate: float = None,
+    prefix: str = "train",
+):
+    """
+    Send one set of metrics to `accelerator.log`, namespacing every key as `"{prefix}/{key}"`.
+    `time` and `epoch` entries are always added (overriding same-named metrics); `learning_rate` is added
+    only when a value is supplied.
+    """
+    payload = {f"{prefix}/{name}": value for name, value in metrics.items()}
+    payload.update({f"{prefix}/time": train_time, f"{prefix}/epoch": epoch})
+    if learning_rate is not None:
+        payload[f"{prefix}/learning_rate"] = learning_rate
+    accelerator.log(payload, step=step)
+def log_pred(
+    accelerator,
+    pred_str: List[str],
+    label_str: List[str],
+    norm_pred_str: List[str],
+    norm_label_str: List[str],
+    step: int,
+    prefix: str = "eval",
+    num_lines: int = 200000,
+):
+    """
+    Helper function to log target/predicted transcriptions to weights and biases (wandb).
+    Two tables are logged through the accelerator's "wandb" tracker: every prediction, and only the rows whose
+    normalised target differs from the normalised prediction. Each table is capped at `num_lines` rows.
+    Nothing happens on processes other than the main one.
+    Args:
+        accelerator: object exposing `is_main_process` and `get_tracker("wandb")`.
+        pred_str: raw predictions; its length determines the number of rows.
+        label_str: raw targets, indexed in parallel with `pred_str`.
+        norm_pred_str: normalised predictions, indexed in parallel with `pred_str`.
+        norm_label_str: normalised targets, indexed in parallel with `pred_str`.
+        step: step the tables are logged at; also used in the table names.
+        prefix: table-name prefix; any "/" is replaced by "-".
+        num_lines: maximum number of rows per table.
+    """
+    if not accelerator.is_main_process:
+        return
+    wandb_tracker = accelerator.get_tracker("wandb")
+    # pretty name for current step: step 50000 -> step 50k
+    cur_step_pretty = f"{int(step // 1000)}k" if step > 1000 else step
+    prefix_pretty = prefix.replace("/", "-")
+    columns = ["Target", "Pred", "Norm Target", "Norm Pred"]
+    # convert str data to a wandb compatible format
+    str_data = [[label_str[i], pred_str[i], norm_label_str[i], norm_pred_str[i]] for i in range(len(pred_str))]
+    # log as a table with the appropriate headers
+    wandb_tracker.log_table(
+        table_name=f"predictions/{prefix_pretty}-step-{cur_step_pretty}",
+        columns=columns,
+        data=str_data[:num_lines],
+        step=step,
+    )
+    # Keep only rows whose normalised target and prediction differ. A plain list filter is used instead of a
+    # numpy array: an empty list becomes a 1-D array (so 2-D indexing fails), and a string array pads every
+    # cell to the longest transcript.
+    str_data_incorrect = [row for row in str_data if row[2] != row[3]]
+    # log as a table with the appropriate headers
+    wandb_tracker.log_table(
+        table_name=f"incorrect_predictions/{prefix_pretty}-step-{cur_step_pretty}",
+        columns=columns,
+        data=str_data_incorrect[:num_lines],
+        step=step,
+    )
+def convert_dataset_str_to_list(
+    dataset_names,
+    dataset_config_names,
+    splits=None,
+    text_column_names=None,
+    dataset_samples=None,
+    default_split="train",
+) -> List[Dict]:
+    """
+    Bundle per-dataset settings into one dictionary per dataset.
+    When `dataset_names` is a string, it and every other non-`None` argument are split on '+'. Each optional
+    argument must then have exactly one entry per dataset, otherwise a `ValueError` is raised. Missing arguments
+    fall back to config "default", split `default_split`, text column "text" and samples `None`; provided sample
+    values are converted to `float`.
+    Returns a list of dicts with the keys "name", "config", "split", "text_column_name" and "samples".
+    """
+    def _split_on_plus(value):
+        # `None` means "not provided" and is passed through untouched.
+        return None if value is None else value.split("+")
+    if isinstance(dataset_names, str):
+        dataset_names = dataset_names.split("+")
+        dataset_config_names = _split_on_plus(dataset_config_names)
+        splits = _split_on_plus(splits)
+        text_column_names = _split_on_plus(text_column_names)
+        dataset_samples = _split_on_plus(dataset_samples)
+    num_datasets = len(dataset_names)
+    # Validate that each optional list lines up with the dataset list.
+    if dataset_config_names is not None and num_datasets != len(dataset_config_names):
+        raise ValueError(
+            f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
+            f" {len(dataset_config_names)} configs."
+        )
+    if splits is not None and len(splits) != num_datasets:
+        raise ValueError(
+            f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
+        )
+    if text_column_names is not None and len(text_column_names) != num_datasets:
+        raise ValueError(
+            f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
+            f" {len(text_column_names)} text column names."
+        )
+    if dataset_samples is None:
+        dataset_samples = [None] * num_datasets
+    else:
+        if len(dataset_samples) != num_datasets:
+            raise ValueError(
+                f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
+                f"{len(dataset_samples)} samples."
+            )
+        dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
+    # Fill in defaults for anything the caller left out.
+    if dataset_config_names is None:
+        dataset_config_names = ["default"] * num_datasets
+    if text_column_names is None:
+        text_column_names = ["text"] * num_datasets
+    if splits is None:
+        splits = [default_split] * num_datasets
+    # All five lists now have one entry per dataset, so they can be zipped row-wise.
+    return [
+        {
+            "name": ds_name,
+            "config": ds_config,
+            "split": ds_split,
+            "text_column_name": ds_text_column,
+            "samples": ds_samples,
+        }
+        for ds_name, ds_config, ds_split, ds_text_column, ds_samples in zip(
+            dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
+        )
+    ]
+def load_multiple_datasets(
+    dataset_names: Union[List, str],
+    dataset_config_names: Union[List, str],
+    splits: Optional[Union[List, str]] = None,
+    text_column_names: Optional[List] = None,
+    sampling_rate: Optional[int] = 16000,
+    stopping_strategy: Optional[str] = "first_exhausted",
+    dataset_samples: Optional[Union[List, np.array]] = None,
+    streaming: Optional[bool] = True,
+    seed: Optional[int] = None,
+    accelerator: Optional[Accelerator] = None,
+    use_pseudo_labels: Optional[bool] = None,
+    **kwargs,
+) -> IterableDataset:
+    """
+    Load one or more datasets, harmonise their columns, and combine them into a single dataset.
+    For every dataset the "audio" column is cast to `sampling_rate`, the text column is renamed to "text", and all
+    columns other than "audio", "text", "whisper_transcript" (only when `use_pseudo_labels` is truthy) and
+    "condition_on_prev" (only when present) are removed.
+    Args:
+        dataset_names: dataset ids, as a list or a '+'-separated string.
+        dataset_config_names: dataset configs, in the same order as `dataset_names`.
+        splits: split name per dataset.
+        text_column_names: transcription column per dataset.
+        sampling_rate: sampling rate passed to `datasets.features.Audio` for the "audio" column.
+        stopping_strategy: forwarded to `interleave_datasets` (streaming mode with several datasets only).
+        dataset_samples: per-dataset sample counts, normalised into interleaving probabilities.
+        streaming: forwarded to `load_dataset`; also selects interleaving (True) or concatenation (False).
+        seed: forwarded to `interleave_datasets`.
+        accelerator: when given, the progress bar is shown on the local main process only.
+        use_pseudo_labels: when truthy, the column `whisper_transcript` is required and kept.
+        **kwargs: forwarded verbatim to `load_dataset`.
+    Returns:
+        The single loaded dataset if only one was requested, otherwise the interleaved (streaming) or concatenated
+        (non-streaming) combination.
+    Raises:
+        ValueError: if a text column or the pseudo-label column is missing from a dataset, or (from
+        `convert_dataset_str_to_list`) if the per-dataset argument lists do not line up.
+    """
+    dataset_names_dict = convert_dataset_str_to_list(
+        dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
+    )
+    if dataset_samples is not None:
+        dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
+        probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
+    else:
+        probabilities = None
+    all_datasets = []
+    # iterate over the datasets we want to interleave
+    for dataset_dict in tqdm(
+        dataset_names_dict,
+        desc="Combining datasets...",
+        disable=not accelerator.is_local_main_process if accelerator is not None else False,
+    ):
+        dataset = load_dataset(
+            dataset_dict["name"],
+            dataset_dict["config"],
+            split=dataset_dict["split"],
+            streaming=streaming,
+            **kwargs,
+        )
+        # resample to specified sampling rate
+        dataset = dataset.cast_column("audio", datasets.features.Audio(sampling_rate))
+        dataset_features = dataset.features.keys()
+        columns_to_keep = {"audio", "text"}
+        if dataset_dict["text_column_name"] not in dataset_features:
+            raise ValueError(
+                f"Text column name {dataset_dict['text_column_name']} not found in dataset"
+                f" '{dataset_dict['name']}'. Make sure to set `--text_column_name` to the"
+                f" correct text column - one of {', '.join(dataset_features)}."
+            )
+        # blanket renaming of all transcription columns to text
+        if dataset_dict["text_column_name"] != "text":
+            dataset = dataset.rename_column(dataset_dict["text_column_name"], "text")
+        if use_pseudo_labels:
+            if "whisper_transcript" not in dataset_features:
+                raise ValueError(
+                    f"Pseudo-label column `whisper_transcript` not found in dataset {dataset_dict['name']}. Ensure "
+                    "pseudo-labels are present in the dataset under this column name, or train directly on the text "
+                    "labels by setting `--use_pseudo_labels=False` and defining the appropriate `--text_column_name`."
+                )
+            columns_to_keep.add("whisper_transcript")
+        if "condition_on_prev" in dataset_features:
+            columns_to_keep.add("condition_on_prev")
+        # re-read the features: the rename above may have changed the column names
+        dataset_features = dataset.features.keys()
+        dataset = dataset.remove_columns(set(dataset_features - columns_to_keep))
+        all_datasets.append(dataset)
+    if len(all_datasets) == 1:
+        # we have a single dataset so just return it as is
+        return all_datasets[0]
+    if streaming:
+        interleaved_dataset = interleave_datasets(
+            all_datasets,
+            stopping_strategy=stopping_strategy,
+            probabilities=probabilities,
+            seed=seed,
+        )
+    else:
+        interleaved_dataset = concatenate_datasets(all_datasets)
+    return interleaved_dataset
+def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint") -> List[str]:
+    """
+    Return the regular checkpoint directories under `output_dir`, ordered by ascending step number.
+    Only directories named `{checkpoint_prefix}-*` are considered. Paths containing "val-wer" (best-model
+    checkpoints) are skipped, as are paths without a number after the prefix.
+    """
+    step_pattern = f".*{checkpoint_prefix}-([0-9]+)"
+    numbered_paths = []
+    for entry in Path(output_dir).glob(f"{checkpoint_prefix}-*"):
+        if not os.path.isdir(entry):
+            continue
+        candidate = str(entry)
+        if "val-wer" in candidate:
+            # best-model checkpoints are handled separately
+            continue
+        found = re.match(step_pattern, candidate)
+        if found is None:
+            continue
+        numbered_paths.append((int(found.groups()[0]), candidate))
+    numbered_paths.sort()
+    return [checkpoint_path for _, checkpoint_path in numbered_paths]
+def sorted_best_checkpoints(output_dir=None, checkpoint_prefix="checkpoint"):
+    """
+    Return the best-model checkpoint directories under `output_dir`, ordered from highest to lowest WER.
+    Only directories named `{checkpoint_prefix}-*` whose path contains `val-wer-<int>.<frac>` are returned.
+    The highest WER comes first, so a caller that deletes from the front of the list removes the worst
+    checkpoints first.
+    """
+    ordering_and_checkpoint_path = []
+    glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
+    for path in glob_checkpoints:
+        regex_match = re.search(r"val-wer-([0-9]+\.[0-9]+)", path)
+        if regex_match is not None:
+            # Sort on the numeric WER. `groups(1)` would return a tuple of strings, and string ordering ranks
+            # "96.036" above "102.500".
+            ordering_and_checkpoint_path.append((float(regex_match.group(1)), path))
+    checkpoints_sorted = sorted(ordering_and_checkpoint_path, reverse=True)
+    return [checkpoint_path for _, checkpoint_path in checkpoints_sorted]
+def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint", sorting_fn=sorted_checkpoints) -> None:
+    """
+    Delete checkpoint directories until at most `save_total_limit` remain.
+    `sorting_fn(output_dir=..., checkpoint_prefix=...)` supplies the candidates; entries at the front of the
+    returned list are removed first. Nothing is deleted when `save_total_limit` is `None` or not positive.
+    """
+    if save_total_limit is None:
+        return
+    if save_total_limit <= 0:
+        return
+    # Check if we should delete older checkpoint(s)
+    ordered_checkpoints = sorting_fn(output_dir=output_dir, checkpoint_prefix=checkpoint_prefix)
+    surplus = len(ordered_checkpoints) - save_total_limit
+    if surplus <= 0:
+        return
+    for stale_checkpoint in ordered_checkpoints[:surplus]:
+        logger.info(f"Deleting older checkpoint [{stale_checkpoint}].")
+        # best effort: removal errors are ignored
+        shutil.rmtree(stale_checkpoint, ignore_errors=True)
+# Matches regular checkpoint folder names only: "checkpoint-<step>-epoch-<epoch>" with nothing after the epoch.
+_RE_CHECKPOINT = re.compile(r"^checkpoint-(\d+)-epoch-(\d+)$")
+def get_last_checkpoint(folder):
+    """
+    Return the path of the checkpoint directory in `folder` with the highest step number.
+    Only sub-directories whose name matches `_RE_CHECKPOINT` are candidates. Returns `None` when there is none.
+    """
+    latest_name = None
+    latest_step = None
+    for entry in os.listdir(folder):
+        parsed = _RE_CHECKPOINT.search(entry)
+        if parsed is None:
+            continue
+        if not os.path.isdir(os.path.join(folder, entry)):
+            continue
+        entry_step = int(parsed.groups()[0])
+        # strict comparison: on equal steps the first entry listed wins
+        if latest_step is None or entry_step > latest_step:
+            latest_name, latest_step = entry, entry_step
+    if latest_name is None:
+        return
+    return os.path.join(folder, latest_name)
+def get_parameter_names(model, forbidden_layer_types, forbidden_module=None):
+    """
+    Collect dotted parameter names of `model`, leaving out excluded sub-modules.
+    A child module is excluded, together with everything beneath it, when it is an instance of one of
+    `forbidden_layer_types` or when it is contained in `forbidden_module`. Parameters held directly by `model`
+    are always included. Useful for building weight-decay masks or for keeping frozen modules out of an
+    optimiser.
+    """
+    collected = []
+    for child_name, submodule in model.named_children():
+        for leaf_name in get_parameter_names(submodule, forbidden_layer_types, forbidden_module):
+            if isinstance(submodule, tuple(forbidden_layer_types)):
+                continue
+            if forbidden_module is not None and submodule in tuple(forbidden_module):
+                continue
+            collected.append(f"{child_name}.{leaf_name}")
+    # Parameters defined directly on this module (nn.Parameter) belong to no child, so add them here.
+    collected.extend(model._parameters.keys())
+    return collected
+def main():
+    # 1. Parse input arguments
+    # We keep distinct sets of args, for cleaner separation of model/data/training related args
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, DistillationTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # 2. Initialize the accelerator
+    # We will let the accelerator handle device placement for us in this example
+    # We simply have to specify the training precision and any trackers being used
+    # We'll use the same dtype arguments as our JAX/Flax training script and convert
+    # it to accelerate format
+    if training_args.dtype == "float16":
+        mixed_precision = "fp16"
+        teacher_dtype = torch.float16
+    elif training_args.dtype == "bfloat16":
+        mixed_precision = "bf16"
+        teacher_dtype = torch.bfloat16
+    else:
+        mixed_precision = "no"
+        teacher_dtype = torch.float32
+    accelerator = Accelerator(
+        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+        mixed_precision=mixed_precision,
+        log_with=training_args.report_to,
+        project_dir=training_args.output_dir,
+    )
+    accelerator.init_trackers(
+        project_name=data_args.wandb_project,
+        init_kwargs={
+            "wandb": {"name": data_args.wandb_name,
+                      "dir": data_args.wandb_dir}
+        }
+    )
+    # 3. Set-up basic logging
+    # Create one log on every process with the configuration for debugging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Log a small summary on each proces
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    logger.info("Training/evaluation parameters %s", training_args)
+    # 4. Detecting last checkpoint and eventually continue from last checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+    # 5. Handle the repository creation
+    if accelerator.is_main_process:
+        if training_args.push_to_hub:
+            if training_args.hub_model_id is None:
+                repo_name = get_full_repo_name(
+                    Path(training_args.output_dir).absolute().name,
+                    token=training_args.hub_token,
+                )
+            else:
+                repo_name = training_args.hub_model_id
+            create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
+            with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "wandb" not in gitignore:
+                    gitignore.write("wandb\n")
+        elif training_args.output_dir is not None:
+            os.makedirs(training_args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+    # 6. Load dataset - either streaming or non-streaming (offline)
+    raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
+    # set seed for determinism
+    set_seed(training_args.seed)
+    if training_args.do_train:
+        raw_datasets["train"] = load_multiple_datasets(
+            data_args.train_dataset_name,
+            data_args.train_dataset_config_name,
+            splits=data_args.train_split_name,
+            text_column_names=data_args.text_column_name,
+            use_pseudo_labels=data_args.use_pseudo_labels,
+            streaming=data_args.streaming,
+            dataset_samples=data_args.train_dataset_samples,
+            seed=training_args.seed,
+            accelerator=accelerator,
+            cache_dir=data_args.dataset_cache_dir,
+            token=model_args.token,
+        )
+        raw_datasets_train_features = list(raw_datasets["train"].features.keys())
+    if training_args.do_eval:
+        dataset_names_dict = convert_dataset_str_to_list(
+            data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
+            (
+                data_args.eval_dataset_config_name
+                if data_args.eval_dataset_config_name
+                else data_args.train_dataset_config_name
+            ),
+            splits=data_args.eval_split_name,
+            text_column_names=data_args.eval_text_column_name,
+        )
+        all_eval_splits = []
+        if len(dataset_names_dict) == 1:
+            # load a single eval set
+            dataset_dict = dataset_names_dict[0]
+            all_eval_splits.append("eval")
+            raw_datasets["eval"] = load_dataset(
+                dataset_dict["name"],
+                dataset_dict["config"],
+                split=dataset_dict["split"],
+                cache_dir=data_args.dataset_cache_dir,
+                token=model_args.token,
+                streaming=data_args.streaming,
+            )
+            if data_args.eval_text_column_name != "text":
+                raw_datasets["eval"] = raw_datasets["eval"].rename_column(data_args.eval_text_column_name, "text")
+        else:
+            # load multiple eval sets
+            for dataset_dict in dataset_names_dict:
+                if dataset_dict["name"] == "esb/diagnostic-dataset":
+                    # for the ESB diagnostic dataset, the dataset name is effectively the config
+                    pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
+                else:
+                    pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
+                all_eval_splits.append(pretty_name)
+                raw_datasets[pretty_name] = load_dataset(
+                    dataset_dict["name"],
+                    dataset_dict["config"],
+                    split=dataset_dict["split"],
+                    cache_dir=data_args.dataset_cache_dir,
+                    token=model_args.token,
+                    streaming=data_args.streaming,
+                )
+                # make column names consistent (text, audio)
+                if dataset_dict["text_column_name"] != "text":
+                    raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
+                        dataset_dict["text_column_name"], "text"
+                    )
+                raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
+                    set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
+                )
+    if not training_args.do_train and not training_args.do_eval:
+        raise ValueError(
+            "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
+        )
+    # 7. Load pretrained model, tokenizer, and feature extractor
+    config = WhisperConfig.from_pretrained(
+        (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        token=model_args.token,
+    )
+    feature_extractor = WhisperFeatureExtractor.from_pretrained(
+        (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        token=model_args.token,
+    )
+    tokenizer = WhisperTokenizerFast.from_pretrained(
+        (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
+        cache_dir=model_args.cache_dir,
+        use_fast=model_args.use_fast_tokenizer,
+        revision=model_args.model_revision,
+        token=model_args.token,
+    )
+    # override timestamp tokens until tokenizer issues are fixed in transformers
+    timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
+    tokenizer.add_tokens(timestamps)
+    # The teacher model can safely be cast to the dtype of training since we don't
+    # update the params
+    teacher_model = WhisperForConditionalGeneration.from_pretrained(
+        model_args.teacher_model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=model_args.token,
+        low_cpu_mem_usage=True,
+        torch_dtype=teacher_dtype,
+        attn_implementation=model_args.attn_implementation,
+    )
+    student_model = WhisperForConditionalGeneration.from_pretrained(
+        model_args.model_name_or_path,
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        subfolder=model_args.subfolder,
+        token=model_args.token,
+        low_cpu_mem_usage=True,
+        attn_implementation=model_args.attn_implementation,
+    )
+    if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
+        raise ValueError(
+            f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
+            f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
+            f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
+        )
+    # enable gradient checkpointing if necessary
+    if training_args.gradient_checkpointing:
+        student_model.gradient_checkpointing_enable()
+    def set_trainable_parameters(module, requires_grad=False):
+        for param in module.parameters():
+            param.requires_grad = requires_grad
+        module._requires_grad = requires_grad
+    # freeze student encoder if necessary
+    if training_args.freeze_encoder:
+        set_trainable_parameters(student_model.model.encoder, requires_grad=False)
+        student_model.model.encoder.gradient_checkpointing = False
+    if training_args.freeze_decoder:
+        set_trainable_parameters(student_model.model.decoder, requires_grad=False)
+        student_model.model.decoder.gradient_checkpointing = False
+        # un-freeze the LM head parameters (and consequently the word embeddings): they get frozen along with the decoder because the word embeddings and LM head are tied
+        set_trainable_parameters(student_model.proj_out, requires_grad=True)
+    if training_args.freeze_embed_positions:
+        # set_trainable_parameters(student_model.model.decoder.embed_tokens, requires_grad=False)
+        set_trainable_parameters(student_model.model.decoder.embed_positions, requires_grad=False)
+        if student_model.model.decoder.gradient_checkpointing:
+            logger.info(
+                "Disabling gradient checkpointing in the decoder since it's incompatible with `freeze_embed_positions`."
+            )
+    logger.info(
+        f"Number of trainable parameters: {sum(p.numel() for p in student_model.parameters() if p.requires_grad):.3e}"
+    )
+    share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
+    if share_hidden_states:
+        # tie the weights for the teacher encoder if we're freezing the student and it's the same as the teacher
+        teacher_model.model.encoder = student_model.model.encoder
+    if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
+        # We need to set the language and task ids for previously multilingual checkpoints
+        is_multilingual = True
+        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task, predict_timestamps=False)
+        student_model.generation_config.update(
+            **{
+                "language": data_args.language,
+                "task": data_args.task,
+            }
+        )
+    elif data_args.language is not None:
+        raise ValueError(
+            "Setting language token for an English-only checkpoint is not permitted. The language argument should "
+            "only be set for multilingual checkpoints."
+        )
+    else:
+        is_multilingual = False
+    # 8. Create a single speech processor - make sure all processes wait until data is saved
+    if accelerator.is_main_process:
+        feature_extractor.save_pretrained(training_args.output_dir)
+        tokenizer.save_pretrained(training_args.output_dir)
+        # save the config and generation config as well
+        config.save_pretrained(training_args.output_dir)
+        student_model.generation_config.save_pretrained(training_args.output_dir)
+    accelerator.wait_for_everyone()
+    processor = WhisperProcessor.from_pretrained(training_args.output_dir)
+    # 9. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
+    # so we just need to set the correct target sampling rate.
+    sampling_rate = feature_extractor.sampling_rate
+    raw_datasets = raw_datasets.cast_column(
+        data_args.audio_column_name,
+        datasets.features.Audio(sampling_rate=sampling_rate),
+    )
+    # 10. Preprocessing the datasets: we need to read the audio files as arrays and tokenize the targets.
+    # 10.1: Define the pre-processing constants
+    max_input_length = int(data_args.max_duration_in_seconds * sampling_rate)
+    min_input_length = int(data_args.min_duration_in_seconds * sampling_rate)
+    max_label_length = (
+        data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
+    )
+    timestamp_probability = data_args.timestamp_probability
+    condition_on_prev_probability = data_args.condition_on_prev_probability
+    return_timestamps = data_args.return_timestamps if timestamp_probability > 0 else False
+    timestamp_ids = tokenizer.timestamp_ids()
+    timestamp_begin = tokenizer.all_special_ids[-1]
+    timestamp_position = 3 if is_multilingual else 1
+    decoder_start_token_id = student_model.config.decoder_start_token_id  # <|startoftranscript|>
+    decoder_prev_token_id = tokenizer.all_special_ids[-3]  # <|startofprev|>
+    prompt_cutoff_length = max_label_length // 2
+    num_workers = data_args.preprocessing_num_workers
+    dataloader_num_workers = training_args.dataloader_num_workers
+    prefetch_factor = training_args.dataloader_prefetch_factor
+    metric = evaluate.load("wer")
+    normalizer = (
+        BasicTextNormalizer()
+        if data_args.language is not None
+        else EnglishTextNormalizer(tokenizer.english_spelling_normalizer)
+    )
+    wer_threshold = data_args.wer_threshold
+    use_pseudo_labels = data_args.use_pseudo_labels
+    train_text_column_name = "whisper_transcript" if use_pseudo_labels else "text"
+    # 10.2: filter based on maximum number of training/evaluation samples
+    if training_args.do_train and data_args.max_train_samples is not None:
+        raw_datasets["train"] = (
+            raw_datasets["train"].take(data_args.max_train_samples)
+            if data_args.streaming
+            else raw_datasets["train"].select(range(data_args.max_train_samples))
+        )
+    if training_args.do_eval and data_args.max_eval_samples is not None:
+        for eval_split in all_eval_splits:
+            raw_datasets[eval_split] = (
+                raw_datasets[eval_split].take(data_args.max_eval_samples)
+                if data_args.streaming
+                else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
+            )
+    # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
+    def is_wer_in_range(ground_truth, whisper_transcript):
+        norm_ground_truth = normalizer(ground_truth)
+        if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
+            # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
+            return False
+        elif len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
+            return True
+        elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
+            norm_whisper_transcript = normalizer(whisper_transcript)
+            wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
+            return wer < wer_threshold
+        else:
+            # filter automatically since weR
+            return False
+    filter_by_wer_threshold = partial(
+        raw_datasets["train"].filter,
+        function=is_wer_in_range,
+        input_columns=["text", "whisper_transcript"],
+    )
+    if wer_threshold is not None and use_pseudo_labels:
+        with accelerator.main_process_first():
+            raw_datasets["train"] = (
+                filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
+                if not data_args.streaming
+                else filter_by_wer_threshold()
+            )
+    # 10.4: pre-process training/evaluation datasets
+    def prepare_train_dataset(batch):
+        """
+        Pre-process the raw dataset in a three stage process:
+            1. Convert the audio arrays to log-mel spectrogram inputs
+            2. Possibly filter the timestamp tokens from the token ids (depending on the timestamp probability)
+            3. Possibly add prompt tokens if conditioning on previous text (depending on the conditioning probability)
+        Works on a batch of examples (column name -> list of values). It adds the columns
+        `input_features`, `input_length` (number of audio array elements per example) and `labels`
+        (token ids, optionally prefixed with prompt ids) and returns the batch.
+        Stages 2 and 3 draw from `np.random.binomial`, so the result depends on the global NumPy RNG state.
+        """
+        # process audio input
+        audio = [sample["array"] for sample in batch["audio"]]
+        inputs = feature_extractor(audio, sampling_rate=sampling_rate)
+        batch["input_features"] = inputs.input_features
+        # length of each raw audio array (in array elements, not seconds)
+        batch["input_length"] = [len(sample) for sample in audio]
+        # process text targets - for training these are the Whisper-generated pseudo-labels
+        input_str_batched = batch[train_text_column_name]
+        # optional column holding prompt token ids per example; defaults to `None` for every example when absent
+        condition_on_prev_batched = batch.get("condition_on_prev", len(input_str_batched) * [None])
+        # `all_token_ids` holds the final labels (with prompt); `all_token_ids_unprompted` holds the
+        # labels before any prompt is prepended and serves as the source of in-batch prompts below
+        all_token_ids = []
+        all_token_ids_unprompted = []
+        for prev_ids, input_str in zip(condition_on_prev_batched, input_str_batched):
+            # the tokenizer only adds special tokens when the text is not a pseudo-label
+            # NOTE(review): presumably the pseudo-labels already contain their special tokens - confirm
+            token_ids = tokenizer(input_str, add_special_tokens=not use_pseudo_labels).input_ids
+            # check whether we have timestamps in the PLs and filter if required
+            has_timestamps = len(set(token_ids) & set(timestamp_ids)) > 0
+            if has_timestamps:
+                # sample from binomial distribution to get probability of training on timestamps
+                predict_timestamps = bool(np.random.binomial(1, timestamp_probability))
+                if not predict_timestamps:
+                    # filter timestamps and insert the <|notimestamps|> task token
+                    # (drops every id >= `timestamp_begin`, then re-inserts `timestamp_begin` at `timestamp_position`)
+                    token_ids = [token for token in token_ids if token < timestamp_begin]
+                    token_ids.insert(timestamp_position, timestamp_begin)
+            all_token_ids_unprompted.append(token_ids)
+            # check whether to condition on previous text - we do this with probability condition_on_prev_probability
+            condition_on_prev = bool(np.random.binomial(1, condition_on_prev_probability))
+            if not condition_on_prev:
+                prev_ids = None
+            elif "condition_on_prev" not in batch and len(all_token_ids_unprompted) > 1:
+                # prompt ids are the penultimate token ids in the batch
+                # (i.e. the unprompted labels of the previous example; the first example of a batch gets no prompt)
+                prev_ids = all_token_ids_unprompted[-2]
+            if prev_ids is not None:
+                # `predict_timestamps` is only defined when `has_timestamps` is True; the `and` short-circuits otherwise
+                if has_timestamps and not predict_timestamps:
+                    # filter timestamp ids from prompt when not predicting timestamps
+                    prev_ids = [token for token in prev_ids if token < timestamp_begin]
+                # check that the length of the prompt does not exceed more than half the max label length (224)
+                # NOTE(review): `decoder_prev_token_id` is only prepended inside the two trimming branches below;
+                # presumably un-trimmed prompts are expected to carry it already - confirm against the data/collator
+                if len(prev_ids) > prompt_cutoff_length:
+                    # keep the last `prompt_cutoff_length - 1` ids and put the prev token in front
+                    prev_ids = prev_ids[-prompt_cutoff_length + 1 :]
+                    prev_ids = [decoder_prev_token_id] + prev_ids
+                # and that the total length of the labels does not exceed the max label length (448)
+                if len(prev_ids + token_ids) > max_label_length:
+                    # drop one extra id from the front of the prompt to make room for the prev token
+                    trim_length = len(prev_ids + token_ids) - max_label_length + 1
+                    prev_ids = prev_ids[trim_length:]
+                    prev_ids = [decoder_prev_token_id] + prev_ids
+                token_ids = prev_ids + token_ids
+            all_token_ids.append(token_ids)
+        batch["labels"] = all_token_ids
+        return batch
+    def prepare_eval_dataset(batch):
+        # process audio input
+        sample = batch["audio"]
+        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+        batch["input_features"] = inputs.input_features[0]
+        batch["input_length"] = len(sample["array"])
+        # process targets - for evaluation these are the ground-truth transcriptions
+        input_str = batch["text"]
+        batch["labels"] = tokenizer(input_str).input_ids
+        return batch
+    vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
+    if training_args.do_train:
+        # with streaming mode we can only have 1 worker, whereas with non-streaming
+        # we can use `num_workers` (which is much faster)
+        # We gate the pre-processing function accordingly
+        map_fn_train = partial(
+            raw_datasets["train"].map,
+            function=prepare_train_dataset,
+            remove_columns=raw_datasets_train_features,
+            batched=True,
+            batch_size=data_args.preprocessing_batch_size,
+        )
+        with accelerator.main_process_first():
+            vectorized_datasets["train"] = (
+                map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
+                if not data_args.streaming
+                else map_fn_train()
+            )
+    if training_args.do_eval:
+        for eval_split in all_eval_splits:
+            raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
+            map_fn_eval = partial(
+                raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
+            )
+            with accelerator.main_process_first():
+                vectorized_datasets[eval_split] = (
+                    map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
+                    if not data_args.streaming
+                    else map_fn_eval()
+                )
+    # 10.5: Filter training data with inputs longer than `max_input_length`
+    def is_audio_in_length_range(length):
+        return min_input_length < length < max_input_length
+    filter_by_audio_fn = partial(
+        vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
+    )
+    with accelerator.main_process_first():
+        vectorized_datasets = (
+            filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
+            if not data_args.streaming
+            else filter_by_audio_fn()
+        )
+    # 10.6: Filter training data with labels longer than `max_label_length`
+    def is_labels_in_length_range(labels):
+        return 0 < len(labels) <= max_label_length
+    filter_by_labels_fn = partial(
+        vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
+    )
+    with accelerator.main_process_first():
+        vectorized_datasets = (
+            filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
+            if not data_args.streaming
+            else filter_by_labels_fn()
+        )
+    # Pre-processing complete!
+    # For large datasets it is advised to run the preprocessing on a
+    # single machine first with `--preprocessing_only` since there will most likely
+    # be a timeout when running the script in distributed mode.
+    # In a second step, `--preprocessing_only` can then be set to `False` to load the
+    # cached dataset
+    if data_args.preprocessing_only:
+        if data_args.streaming:
+            raise ValueError(
+                "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
+                "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
+                "on the fly with streaming mode."
+            )
+        cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
+        logger.info(f"Data preprocessing finished. Files cached at {cache}.")
+        return
+    # 11. Define Evaluation Metrics
+    def compute_metrics(preds, labels):
+        # replace padded labels by the padding token
+        for idx in range(len(labels)):
+            labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
+        pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
+        # we do not want to group tokens when computing the metrics
+        label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
+        wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
+        # Normalize everything
+        norm_pred_str = []
+        norm_label_str = []
+        # Iterate through all predictions and labels
+        for pred, label in zip(pred_str, label_str):
+            # Normalize the prediction and label
+            normalized_pred = normalizer(pred)
+            normalized_label = normalizer(label)
+            # If either normalized string is empty after normalization, replace with "<|nocaptions|>"
+            if not normalized_pred.strip():
+                normalized_pred = "<|nocaptions|>"
+            if not normalized_label.strip():
+                normalized_label = "<|nocaptions|>"
+            norm_pred_str.append(normalized_pred)
+            norm_label_str.append(normalized_label)
+        # Replace original strings with "<|nocaptions|>" where necessary for consistency
+        pred_str = [pred if len(pred.strip()) > 0 else "<|nocaptions|>" for pred in pred_str]
+        label_str = [label if len(label.strip()) > 0 else "<|nocaptions|>" for label in label_str]
+        # Compute WER using all entries, including those with "<|nocaptions|>"
+        wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
+        return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
+    # 12. Define Training Schedule
+    # Store some constants
+    per_device_train_batch_size = int(training_args.per_device_train_batch_size)
+    train_batch_size = per_device_train_batch_size * accelerator.num_processes
+    gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
+    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
+    if not data_args.streaming and training_args.max_steps < 0:
+        num_epochs = int(training_args.num_train_epochs)
+        steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
+        total_train_steps = steps_per_epoch * num_epochs
+    elif training_args.max_steps > 0:
+        logger.info("max_steps is given, it will override any value given in num_train_epochs")
+        total_train_steps = int(training_args.max_steps)
+        if not data_args.streaming:
+            steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
+            num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
+        else:
+            # Setting a very large number of epochs so we go as many times as necessary over the iterator.
+            num_epochs = sys.maxsize
+            steps_per_epoch = total_train_steps
+    else:
+        raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
+    if training_args.eval_steps is None:
+        logger.info(
+            f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
+        )
+        eval_steps = steps_per_epoch
+    else:
+        eval_steps = training_args.eval_steps
+    # 13. Define optimizer, LR scheduler, collator
+    forbidden_module = [
+        module
+        for module, flag in [
+            (student_model.model.encoder, training_args.freeze_encoder),
+            (student_model.model.decoder, training_args.freeze_decoder)
+        ]
+        if flag
+    ] or None
+    decay_parameters = get_parameter_names(
+        student_model,
+        [nn.LayerNorm],
+        forbidden_module=forbidden_module,
+    )
+    decay_parameters = [name for name in decay_parameters if "bias" not in name]
+    optimizer_grouped_parameters = [
+        {
+            "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
+            "weight_decay": training_args.weight_decay,
+        },
+        {
+            "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        params=optimizer_grouped_parameters,
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+    )
+    # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
+    lr_scheduler = get_scheduler(
+        name=training_args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
+        num_training_steps=total_train_steps * accelerator.num_processes,
+    )
+    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
+        processor=processor,
+        decoder_start_token_id=decoder_start_token_id,
+        decoder_prev_token_id=decoder_prev_token_id,
+        input_padding="longest",
+        target_padding="max_length",
+        max_target_length=max_label_length,
+    )
+    # 14. Define generation arguments - we need to do this before we wrap the models in DDP
+    # so that we can still access the configs
+    num_beams = (
+        training_args.generation_num_beams
+        if training_args.generation_num_beams is not None
+        else getattr(student_model.generation_config, "num_beams", 1)
+    )
+    gen_kwargs = {
+        "max_length": max_label_length,
+        "num_beams": num_beams,
+        "return_timestamps": return_timestamps,
+    }
+    if is_multilingual:
+        # forcing the language and task tokens helps multilingual models in their generations
+        gen_kwargs.update(
+            {
+                "language": data_args.language,
+                "task": data_args.task,
+            }
+        )
+    # 15. Prepare everything with accelerate
+    student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
+        student_model, teacher_model, optimizer, lr_scheduler
+    )
+    def kl_divergence(target_distribution, log_predicted_distribution, labels):
+        kl_loss = nn.KLDivLoss(reduction="none")
+        divergence = kl_loss(log_predicted_distribution, target_distribution)
+        # ignore padded tokens from divergence, i.e. where labels are not set to -100
+        padding_mask = labels >= 0
+        padding_mask = padding_mask.unsqueeze(-1)
+        divergence = divergence * padding_mask
+        # take the average over the mini-batch
+        divergence = divergence.sum() / padding_mask.sum()
+        return divergence
+    # Define gradient update step fn
+    def train_step(
+        batch,
+        temperature=2.0,
+    ):
+        student_model.train()
+        teacher_model.eval()
+        student_outputs = student_model(**batch)
+        with torch.no_grad():
+            if share_hidden_states:
+                # if the student and teacher share the same frozen encoder then we don't have to recompute the
+                # encoder hidden-states for the teacher model, we can just re-use from the student
+                encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
+                teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
+            else:
+                # do the full forward pass for the teacher model (encoder + decoder)
+                teacher_outputs = teacher_model(**batch)
+        # CE (data) loss
+        ce_loss = student_outputs.loss
+        # rescale distribution by temperature to ensure gradients scale correctly
+        teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
+        # log softmax of student predictions for numerical stability
+        student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
+        # KL-divergence loss (scaled by temperature)
+        kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
+        # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
+        loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
+        metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
+        return loss, metrics
+    # Define eval fn
+    def eval_step(batch):
+        student_model.eval()
+        teacher_model.eval()
+        with torch.no_grad():
+            student_outputs = student_model(**batch)
+            if share_hidden_states:
+                encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
+                teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
+            else:
+                teacher_outputs = teacher_model(**batch)
+        # CE (data) loss
+        ce_loss = student_outputs.loss
+        # log softmax / softmax for numerical stability
+        student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
+        teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
+        # temperature is always 1 for eval
+        kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
+        # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
+        loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
+        metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
+        return metrics
+    def generate_step(batch):
+        student_model.eval()
+        output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
+        output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
+        return output_ids
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
+    if not data_args.streaming:
+        logger.info(f"  Num epochs = {num_epochs}")
+    logger.info("  Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
+    logger.info("  Gradient accumulation steps =" f" {gradient_accumulation_steps}")
+    logger.info(
+        f"  Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
+    )
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+    # ======================== Training ================================
+    train_time = 0
+    train_start = time.time()
+    steps_trained_progress_bar = tqdm(
+        range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
+    )
+    continue_training = True
+    epochs_trained = 0
+    cur_step = 0
+    best_val_wer = np.inf
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+    if checkpoint is not None:
+        accelerator.load_state(checkpoint)
+        # Find num steps and epoch from saved state string pattern
+        pattern = r"checkpoint-(\d+)-epoch-(\d+)"
+        match = re.search(pattern, checkpoint)
+        cur_step = int(match.group(1))
+        epochs_trained = int(match.group(2))
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info(f"  Continuing training from epoch {epochs_trained}")
+        logger.info(f"  Continuing training from global step {cur_step}")
+        steps_trained_progress_bar.update(cur_step)
+        for epoch in range(0, epochs_trained):
+            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+        if not data_args.streaming and training_args.max_steps < 0:
+            # we know exactly the number of steps per epoch, so can skip through the required number of batches
+            resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
+        else:
+            # Currently we don't know how many steps we've taken in the current epoch
+            # So we just shuffle the dataset one extra time and start from a fresh epoch
+            # This is "good enough" for our purposes but not fully correct
+            resume_step = None
+            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+    else:
+        resume_step = None
+    for epoch in range(epochs_trained, num_epochs):
+        vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+        train_dataloader = DataLoader(
+            vectorized_datasets["train"],
+            collate_fn=data_collator,
+            batch_size=per_device_train_batch_size,
+            num_workers=dataloader_num_workers,
+            prefetch_factor=prefetch_factor,
+            pin_memory=training_args.dataloader_pin_memory,
+        )
+        train_dataloader = accelerator.prepare(train_dataloader)
+        if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
+            train_dataloader.dataset.set_epoch(epoch)
+        if resume_step is not None:
+            # Skip the first N batches in the dataloader when resuming from a checkpoint
+            train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+            resume_step = None
+        for batch in train_dataloader:
+            with accelerator.accumulate(student_model):
+                loss, train_metric = train_step(batch, temperature=training_args.temperature)
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+            # Check if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                steps_trained_progress_bar.update(1)
+                cur_step += 1
+                if cur_step % training_args.logging_steps == 0:
+                    steps_trained_progress_bar.write(
+                        f"Step... ({cur_step} / {total_train_steps} | Loss:"
+                        f" {train_metric['loss']}, Learning Rate:"
+                        f" {lr_scheduler.get_last_lr()[0]})"
+                    )
+                    log_metric(
+                        accelerator,
+                        metrics=train_metric,
+                        learning_rate=lr_scheduler.get_last_lr()[0],
+                        train_time=train_time + time.time() - train_start,
+                        step=cur_step,
+                        epoch=epoch,
+                        prefix="train",
+                    )
+                # save checkpoint and weights after each save_steps and at the end of training
+                if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
+                    intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
+                    accelerator.save_state(output_dir=intermediate_dir)
+                    feature_extractor.save_pretrained(intermediate_dir)
+                    tokenizer.save_pretrained(intermediate_dir)
+                    config.save_pretrained(intermediate_dir)
+                    student_model.generation_config.save_pretrained(intermediate_dir)
+                    accelerator.wait_for_everyone()
+                    if accelerator.is_main_process:
+                        rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
+                        if training_args.push_to_hub:
+                            upload_folder(
+                                folder_path=training_args.output_dir,
+                                repo_id=repo_name,
+                                repo_type="model",
+                                commit_message=f"Saving train state of step {cur_step}",
+                            )
+                if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
+                    train_time += time.time() - train_start
+                    student_model.eval()
+                    wer_l, labels_l = [], []
+                    # ======================== Evaluating ==============================
+                    for eval_split in all_eval_splits:
+                        eval_metrics = []
+                        eval_preds = []
+                        eval_labels = []
+                        eval_start = time.time()
+                        validation_dataloader = DataLoader(
+                            vectorized_datasets[eval_split],
+                            collate_fn=data_collator,
+                            batch_size=per_device_eval_batch_size,
+                            drop_last=False,
+                            num_workers=dataloader_num_workers,
+                            prefetch_factor=prefetch_factor,
+                            pin_memory=training_args.dataloader_pin_memory,
+                        )
+                        validation_dataloader = accelerator.prepare(validation_dataloader)
+                        for batch in tqdm(
+                            validation_dataloader,
+                            desc=f"Evaluating {eval_split}...",
+                            position=2,
+                            disable=not accelerator.is_local_main_process,
+                        ):
+                            # Model forward
+                            eval_metric = eval_step(batch)
+                            eval_metric = accelerator.gather_for_metrics(eval_metric)
+                            eval_metrics.append(eval_metric)
+                            # generation
+                            if training_args.predict_with_generate:
+                                generated_ids = generate_step(batch)
+                                # Gather all predictions and targets
+                                generated_ids, labels = accelerator.gather_for_metrics(
+                                    (generated_ids, batch["labels"])
+                                )
+                                eval_preds.extend(generated_ids)
+                                eval_labels.extend(labels)
+                        eval_time = time.time() - eval_start
+                        # normalize eval metrics
+                        eval_metrics = {
+                            key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
+                        }
+                        # compute WER metric
+                        wer_desc = ""
+                        if training_args.predict_with_generate:
+                            wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
+                                eval_preds, eval_labels
+                            )
+                            eval_metrics.update(wer_metric)
+                            wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
+                            log_pred(
+                                accelerator,
+                                pred_str,
+                                label_str,
+                                norm_pred_str,
+                                norm_label_str,
+                                step=cur_step,
+                                prefix=eval_split,
+                            )
+                        # Print metrics and update progress bar
+                        steps_trained_progress_bar.write(
+                            f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
+                            f" {wer_desc})"
+                        )
+                        wer_l.append(wer_metric)
+                        labels_l.append(norm_label_str)
+                        log_metric(
+                            accelerator,
+                            metrics=eval_metrics,
+                            train_time=eval_time,
+                            step=cur_step,
+                            epoch=epoch,
+                            prefix=eval_split,
+                        )
+                    # flush the train metrics
+                    train_start = time.time()
+                    # save best checkpoint
+                    numerators = [wer['wer'] * len(labs) for wer, labs in zip(wer_l, labels_l)]
+                    val_wer = sum(numerators) / sum(len(labs) for labs in labels_l)
+                    if val_wer < best_val_wer:
+                        intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}-val-wer-{val_wer:.3f}")
+                        logger.info(f"Saving new best model, validation WER: {val_wer:.3f}")
+                        accelerator.save_state(output_dir=intermediate_dir)
+                        feature_extractor.save_pretrained(intermediate_dir)
+                        tokenizer.save_pretrained(intermediate_dir)
+                        config.save_pretrained(intermediate_dir)
+                        student_model.generation_config.save_pretrained(intermediate_dir)
+                        accelerator.wait_for_everyone()
+                        # remove unnecessary checkpoints, save best model and push to hub
+                        if accelerator.is_main_process:
+                            rotate_checkpoints(training_args.save_best_total_limit, output_dir=training_args.output_dir, sorting_fn=sorted_best_checkpoints)
+                            accelerator.unwrap_model(student_model).save_pretrained(training_args.output_dir)
+                            if training_args.push_to_hub:
+                                upload_folder(
+                                    folder_path=training_args.output_dir,
+                                    repo_id=repo_name,
+                                    repo_type="model",
+                                    commit_message=f"Saving best state, step {cur_step}, val wer {val_wer:.3f}",
+                                )
+                        best_val_wer = val_wer
+                # break condition
+                if cur_step == total_train_steps:
+                    # the model under training_args.output_dir is the best model, let's also save end of training weights
+                    final_weights_dir = os.path.join(training_args.output_dir, "end-of-training-weights")
+                    feature_extractor.save_pretrained(final_weights_dir)
+                    tokenizer.save_pretrained(final_weights_dir)
+                    # save the config and generation config as well
+                    config.save_pretrained(final_weights_dir)
+                    student_model.generation_config.save_pretrained(final_weights_dir)
+                    # un-wrap student model for save
+                    student_model = accelerator.unwrap_model(student_model)
+                    student_model.save_pretrained(final_weights_dir)
+                    if training_args.push_to_hub:
+                        upload_folder(
+                            folder_path=training_args.output_dir,
+                            repo_id=repo_name,
+                            repo_type="model",
+                            commit_message=f"Saving final weights of step {cur_step}",
+                        )
+                    continue_training = False
+                    break
+        if not continue_training:
+            break
+    accelerator.end_training()
+if __name__ == "__main__":
+    main()

run_large_training.sh ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/usr/bin/env bash
+accelerate launch run_distillation.py \
+  --model_name_or_path "./nb-distil-large-init" \
+  --teacher_model_name_or_path "NbAiLab/nb-whisper-large" \
+  --train_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_large" \
+  --train_dataset_config_name "" \
+  --train_split_name "train" \
+  --eval_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_large" \
+  --eval_dataset_config_name "" \
+  --eval_split_name "validation" \
+  --eval_steps 500 \
+  --save_steps 1000 \
+  --warmup_steps 1000 \
+  --learning_rate 0.0003 \
+  --lr_scheduler_type "constant_with_warmup" \
+  --timestamp_probability 0.2 \
+  --condition_on_prev_probability 0.2 \
+  --language "no" \
+  --task "transcribe" \
+  --logging_steps 200 \
+  --save_total_limit 1 \
+  --max_steps 50000 \
+  --wer_threshold 20 \
+  --per_device_train_batch_size 32 \
+  --per_device_eval_batch_size 32 \
+  --dataloader_num_workers 8 \
+  --preprocessing_num_workers 8 \
+  --ddp_timeout 7200 \
+  --dtype "bfloat16" \
+  --attn_implementation "sdpa" \
+  --output_dir "./" \
+  --do_train \
+  --do_eval \
+  --gradient_checkpointing \
+  --overwrite_output_dir \
+  --predict_with_generate \
+  --freeze_encoder \
+  --freeze_embed_positions \
+  --streaming True \
+  --push_to_hub

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,139 @@

+{
+  "additional_special_tokens": [
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|yue|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nospeech|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff