diff --git a/all_results.json b/all_results.json index df25a5013c26e917f11904be50ffdc9ae40d058f..be24968469d9b4561f63ce8be9d1f5926f0f2d59 100644 --- a/all_results.json +++ b/all_results.json @@ -1 +1 @@ -{"eval_bleu": 11.129096758139427} \ No newline at end of file +{"eval_bleu": 11.95496673517359} \ No newline at end of file diff --git a/model.safetensors b/model.safetensors index 92745342de71a3a40cd2828dd577f239c7ae6511..2554220532c46142a0c015bd8f16bde3b8fbe184 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bddd86ae385054205c2f2a87587f749ff2291c4bbefb13e41f4c43188107fc6d +oid sha256:be5bed2c53d7430ddd0df2836d1796227c465e9b817e300083db87512bea7389 size 2460355904 diff --git a/step_10000/added_tokens.json b/step_10000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_10000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_10000/config.json b/step_10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_10000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_10000/generation_config.json b/step_10000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_10000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_10000/model.safetensors b/step_10000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..03898a4d999fba2fbc471311a86f992ceeee32a8 --- /dev/null +++ b/step_10000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef9a3a2d9f945796bff930ac442a117678a0fd38132889e5fb514f8e5e9193e +size 2460355904 diff --git a/step_10000/optimizer.bin b/step_10000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..255b4d04dc69bb656f0c7f775dddb86fcfb44461 --- /dev/null +++ b/step_10000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4742334fc3d58dbc4009bc868c0c63583c0e9e5453b6ba81844edd0338ac6fc +size 4921023445 diff --git a/step_10000/random_states_0.pkl b/step_10000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..68944a646e8b5fc3e57c35ef20216e411bef09f2 --- /dev/null +++ b/step_10000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c966fa9f249f503826cbf520b3bc0c2bcaf742f45f9d08d4d9424a5130cee1 +size 14344 diff --git a/step_10000/scheduler.bin b/step_10000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..c900624a88ed634c800ce220f19f9116c993fd8a --- /dev/null +++ b/step_10000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ef7490c774f55c093922a513bca6fa45e62f1482c7ad6a2b0ede3013e19c91 +size 1064 diff --git a/step_10000/sentencepiece.bpe.model b/step_10000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_10000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_10000/special_tokens_map.json b/step_10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_10000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_10000/tokenizer.json b/step_10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_10000/tokenizer_config.json b/step_10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_10000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_12000/added_tokens.json b/step_12000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_12000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_12000/config.json b/step_12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_12000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_12000/generation_config.json b/step_12000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_12000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_12000/model.safetensors b/step_12000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..03cd2a861764ca88fab159db561cab6812a1b687 --- /dev/null +++ b/step_12000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f88e583bbaf88b89ce261c7966b6f33e4be24afa217671d8e33e1531eeada11 +size 2460355904 diff --git a/step_12000/optimizer.bin b/step_12000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..ee27c0c50c9aeddea4ab52fcfbc36fd1c9ec5916 --- /dev/null +++ b/step_12000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5378820c5e0212338052c9a6e9c906ea4f19239412293db19ed0f68e0628c107 +size 4921023445 diff --git a/step_12000/random_states_0.pkl b/step_12000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..70835049485b3b5ad5f1a874117c9a9e5e5bc13d --- /dev/null +++ b/step_12000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16002ac07fffe99784491d19c12fa09cd3a611772b410444894884a8ad12e302 +size 14344 diff --git a/step_12000/scheduler.bin b/step_12000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..2569189782c482427423b1fc63543136cfbe17e0 --- /dev/null +++ b/step_12000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:634fc4b6da9606be6a67d31d5d010c77305ccfd8915e9cf75e5b3983a55dfefb +size 1064 diff --git a/step_12000/sentencepiece.bpe.model b/step_12000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_12000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_12000/special_tokens_map.json b/step_12000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_12000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_12000/tokenizer.json b/step_12000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_12000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_12000/tokenizer_config.json b/step_12000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_12000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_14000/added_tokens.json b/step_14000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_14000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_14000/config.json b/step_14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_14000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_14000/generation_config.json b/step_14000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_14000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_14000/model.safetensors b/step_14000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..034ec1b172140b0ef1a81c0f4a58ca0d482fe636 --- /dev/null +++ b/step_14000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ccdc20a71f9e4659eec0ad714eee6abf93c4eb162337b989a9b1c19f10b0cc1 +size 2460355904 diff --git a/step_14000/optimizer.bin b/step_14000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..454fe2518fed8b66aa537fa0c983ea6af75cf489 --- /dev/null +++ b/step_14000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0b7070edb2b38f2b79188b32672f2ca8ee8474a87ab826132366f74f04afa5 +size 4921023445 diff --git a/step_14000/random_states_0.pkl b/step_14000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d2bd000f76a0260c975ddb9a3cdef42c69c55e24 --- /dev/null +++ b/step_14000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d3ead1b0b82fa1fd8addcfecf4b2068399c1bef2c1dc0fc2a790508ef48c39e +size 14344 diff --git a/step_14000/scheduler.bin b/step_14000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..c4d7d56d327b4d0a6c512011c26050d1418430a4 --- /dev/null +++ b/step_14000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6558f4e5fd73d72943c8757c3789e7e04d2a45f40aeb221b4f8aeccef9c4cd87 +size 1064 diff --git a/step_14000/sentencepiece.bpe.model b/step_14000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_14000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_14000/special_tokens_map.json b/step_14000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_14000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_14000/tokenizer.json b/step_14000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_14000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_14000/tokenizer_config.json b/step_14000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_14000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_16000/added_tokens.json b/step_16000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_16000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_16000/config.json b/step_16000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_16000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_16000/generation_config.json b/step_16000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_16000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_16000/model.safetensors b/step_16000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3938de6e7720eafa0df45eabf5df9c50b3ccac90 --- /dev/null +++ b/step_16000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845b8700c25483641803daef5398c93a1a75ab96d7d327061c33864b9e6d3e64 +size 2460355904 diff --git a/step_16000/optimizer.bin b/step_16000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e9babef7cb7571dc6bc15077e484a9148dd4f05 --- /dev/null +++ b/step_16000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05e18d4942c75a9676bca3883cd4a93d25b6e717129e1ed39e4b9db2a9271814 +size 4921023445 diff --git a/step_16000/random_states_0.pkl b/step_16000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..21f417311dc8f2eb16323febb866d151f3bc57c1 --- /dev/null +++ b/step_16000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f0b612c9e7f873952ae58eae881ede1e79fedf2712b9fb75d3e67d019c5cd7 +size 14344 diff --git a/step_16000/scheduler.bin b/step_16000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..3408e9901f983bc0d951b6aacffe702a2a3ef578 --- /dev/null +++ b/step_16000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e87f892b85d02db83a40868471587da14b46502563c4f771fd5e397faf0d39 +size 1064 diff --git a/step_16000/sentencepiece.bpe.model b/step_16000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_16000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_16000/special_tokens_map.json b/step_16000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_16000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_16000/tokenizer.json b/step_16000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_16000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_16000/tokenizer_config.json b/step_16000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_16000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_17000/added_tokens.json b/step_17000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_17000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_17000/config.json b/step_17000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_17000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_17000/generation_config.json b/step_17000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_17000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_17000/model.safetensors b/step_17000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28b036139ffefa1c10fcdcfb1001a31cdc53abf3 --- /dev/null +++ b/step_17000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b807afc7cd93cf9c2ad6b84152298d77249a53c4fb3d6be8fd9d729b320cd2bd +size 2460355904 diff --git a/step_17000/optimizer.bin b/step_17000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..ea2dd63669400aed03b813169476d9ae4bd1e63a --- /dev/null +++ b/step_17000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5083d97a907819afbb6c972c008c82eee1e48621f33aa3c47da49fdcbb250c5 +size 4921023445 diff --git a/step_17000/random_states_0.pkl b/step_17000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..da1f385546f4cce90d5428de161d4b8b8f770696 --- /dev/null +++ b/step_17000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba6c02f807c0fc635ef3c2ed27355c50b060c1e24f822997fc141c42aed6828 +size 14344 diff --git a/step_17000/scheduler.bin b/step_17000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..18cad3a6ecdc725b906a2f643fd5cdc00f3952d5 --- /dev/null +++ b/step_17000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd610bd1b195825e4d4dbbe9be59f7c2a58e02b167298cabcd9ebfd451be7ba7 +size 1064 diff --git a/step_17000/sentencepiece.bpe.model b/step_17000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_17000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_17000/special_tokens_map.json b/step_17000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_17000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_17000/tokenizer.json b/step_17000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_17000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_17000/tokenizer_config.json b/step_17000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_17000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_18000/added_tokens.json b/step_18000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_18000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_18000/config.json b/step_18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_18000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_18000/generation_config.json b/step_18000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_18000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_18000/model.safetensors b/step_18000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ea7e24af15f876bb27d1e66cd8c048cb9d0fbbe --- /dev/null +++ b/step_18000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbb7603bb906c586e2c8ca553323b372a4486baf384ca2aa94f9911cf57ca45 +size 2460355904 diff --git a/step_18000/optimizer.bin b/step_18000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..7dae705c4be957cfc1539b862d5927074fc17256 --- /dev/null +++ b/step_18000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eaa43f6f09cd39103be006d02aff0a3dcde08cb343c4587ef07e53a915d634d +size 4921023445 diff --git a/step_18000/random_states_0.pkl b/step_18000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4879b3f14e65653246d4f6e75f8ccf0b90a8c23b --- /dev/null +++ b/step_18000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:090ed52ca8093b7364c9aa2931c9440f8e154605abc8aaf429784e060e10b51c +size 14344 diff --git a/step_18000/scheduler.bin b/step_18000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..3008a5acd449af8f94d65b5312d86ce4414da77d --- /dev/null +++ b/step_18000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5841241e9ec29d86eea02fac6abb6943bbde4e94626d38550d681a8758d5e7cd +size 1064 diff --git a/step_18000/sentencepiece.bpe.model b/step_18000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_18000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_18000/special_tokens_map.json b/step_18000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_18000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_18000/tokenizer.json b/step_18000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_18000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_18000/tokenizer_config.json b/step_18000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_18000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_19000/added_tokens.json b/step_19000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_19000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_19000/config.json b/step_19000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_19000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_19000/generation_config.json b/step_19000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_19000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_19000/model.safetensors b/step_19000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..833995e8da808c928708b167ccc82af2a0685c0d --- /dev/null +++ b/step_19000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7211ba2e29c6b30dcc542c7aef7a7b3993c964a9ea3015a39afc1adedd543c75 +size 2460355904 diff --git a/step_19000/optimizer.bin b/step_19000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..367fff51e30d77a61563d762861375b36e99e3c9 --- /dev/null +++ b/step_19000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8acb660cbe1cf2dc1ba11b7c3cc10f8ffb673ddbe3ffa6007f6e25347178140a +size 4921023445 diff --git a/step_19000/random_states_0.pkl b/step_19000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..13d95570eb374bd267a7dec75ab4334ace11e989 --- /dev/null +++ b/step_19000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0144f8dba12b6a53f38f88bc3d6a98620bf80c3430f327d7394a84df00bccb9b +size 14344 diff --git a/step_19000/scheduler.bin b/step_19000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..ff67c2b9e669a81aeb47d06ff049ef525c2975ec --- /dev/null +++ b/step_19000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad87e2e5fad23d49dee27b459553c6e54228793c0d5cfa2edfd638a413ad0567 +size 1064 diff --git a/step_19000/sentencepiece.bpe.model b/step_19000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_19000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_19000/special_tokens_map.json b/step_19000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_19000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_19000/tokenizer.json b/step_19000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_19000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_19000/tokenizer_config.json b/step_19000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_19000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_2000/added_tokens.json b/step_2000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_2000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_2000/config.json b/step_2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_2000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_2000/generation_config.json b/step_2000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_2000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_2000/model.safetensors b/step_2000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02587f40d712a8f011d84d68eeee5b4c6f9fd8d9 --- /dev/null +++ b/step_2000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b08fa80a2b4947f162188d17182e41a785151f1c350fc19fa0768081bcaed7cb +size 2460355904 diff --git a/step_2000/optimizer.bin b/step_2000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f99d22db1ab178469f1f4691a7bc551fc0f5bbd --- /dev/null +++ b/step_2000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238be9b3cb67319799a27791c7bbddf6251ea3285f2271d6ff3fbb5b5912ffd0 +size 4921023445 diff --git a/step_2000/random_states_0.pkl b/step_2000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c684561cdc8b56bf139a8e1da9f0f095fae84b6c --- /dev/null +++ b/step_2000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fbd896ce644a0f534bbfb95e11bb1a3434a391cd48d89f1ed54a85eb8c80098 +size 14344 diff --git a/step_2000/scheduler.bin b/step_2000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..47bbae42c59f2faa6c176f92c78fd6bcdb58d1bf --- /dev/null +++ b/step_2000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb2b3b063e72a3f249f585ce4d7542399df25ecbced7bc8eaca9e5a1c2fe977 +size 1064 diff --git a/step_2000/sentencepiece.bpe.model b/step_2000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_2000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_2000/special_tokens_map.json b/step_2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_2000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_2000/tokenizer.json b/step_2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_2000/tokenizer_config.json b/step_2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_2000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_20000/added_tokens.json b/step_20000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_20000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_20000/config.json b/step_20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_20000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_20000/generation_config.json b/step_20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_20000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_20000/model.safetensors b/step_20000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2554220532c46142a0c015bd8f16bde3b8fbe184 --- /dev/null +++ b/step_20000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be5bed2c53d7430ddd0df2836d1796227c465e9b817e300083db87512bea7389 +size 2460355904 diff --git a/step_20000/optimizer.bin b/step_20000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0a7109ff7b1e6cff94f385e7e984603dee60704 --- /dev/null +++ b/step_20000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b5409b4795094778bd6b3085b2cf3d30a83c10663c6e3543d03c802314910f +size 4921023445 diff --git a/step_20000/random_states_0.pkl b/step_20000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..61d8912ee34be9d3c987501d58462cb741546166 --- /dev/null +++ b/step_20000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbae6bc4d95bd437aebad6b98870f75762a1615047903af3374e7fdd2df35e72 +size 14344 diff --git a/step_20000/scheduler.bin b/step_20000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..f2b6569383ee86b95cae1efa1d9a48486ba9544e --- /dev/null +++ b/step_20000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d318211cbcb28b979dd55bb60cc12068ffb6765dd97afe24b277f2f157924af +size 1064 diff --git a/step_20000/sentencepiece.bpe.model b/step_20000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_20000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_20000/special_tokens_map.json b/step_20000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_20000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_20000/tokenizer.json b/step_20000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_20000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_20000/tokenizer_config.json b/step_20000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_20000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_4000/added_tokens.json b/step_4000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_4000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_4000/config.json b/step_4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_4000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_4000/generation_config.json b/step_4000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_4000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_4000/model.safetensors b/step_4000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d75af67de9d639de02b52c9c3a9a0b39e3377d80 --- /dev/null +++ b/step_4000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8bb4fea61b0a4ffc9ac709e0ca0728975ac96b7a8473c8b6b426cd1eec36e65 +size 2460355904 diff --git a/step_4000/optimizer.bin b/step_4000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c8392e4ab20c319981c897199b9e29322e258c3 --- /dev/null +++ b/step_4000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88886d33223f97299e4c13c61d5ee2f191a98c42776acfdb57c7f97691c5baf9 +size 4921023445 diff --git a/step_4000/random_states_0.pkl b/step_4000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..31917b38a7a9763a52536f9f69cb3f8e7278d57a --- /dev/null +++ b/step_4000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131992bffbaae8cb39a6b68a6bf6f615f27211937621d14ab541335de8f7c52f +size 14344 diff --git a/step_4000/scheduler.bin b/step_4000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..f27a5ccc30ddeeda3ace3317f0fd7eb4460bd755 --- /dev/null +++ b/step_4000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a55632bddd31c4eded8bf55551b83ebeb88c2d8c13e3aa4fb1f69f04a529d724 +size 1064 diff --git a/step_4000/sentencepiece.bpe.model b/step_4000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_4000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_4000/special_tokens_map.json b/step_4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_4000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_4000/tokenizer.json b/step_4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_4000/tokenizer_config.json b/step_4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_4000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_6000/added_tokens.json b/step_6000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_6000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_6000/config.json b/step_6000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_6000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_6000/generation_config.json b/step_6000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_6000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_6000/model.safetensors b/step_6000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bfcfe8a9afc07d9038e19098385cb3bafa5a7877 --- /dev/null +++ b/step_6000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f5716f44b26acbb3c5bf04d25c8d98bd161b687560f6a1a35196067751ae505 +size 2460355904 diff --git a/step_6000/optimizer.bin b/step_6000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a816ece38862721df8a755537e9c4d8bf91c7f9 --- /dev/null +++ b/step_6000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdea4c016ba64ea02b25c564928e2dbb33dff4c30753e2fc4637c1283616bd63 +size 4921023445 diff --git a/step_6000/random_states_0.pkl b/step_6000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..40a9959f569c9bdcc877b84ed1371d15b8a7cecf --- /dev/null +++ b/step_6000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cc0d64f027ae41e69ad82759960c3284cf0b454484f121129c16004f7b3d055 +size 14344 diff --git a/step_6000/scheduler.bin b/step_6000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..92c5c8e3957a3fe0db8dd7df1df415df52976874 --- /dev/null +++ b/step_6000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da8ef95519b9f9f3db19158c5cd9b46240a0bf59e730b7934cf6978b804c29e +size 1064 diff --git a/step_6000/sentencepiece.bpe.model b/step_6000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_6000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_6000/special_tokens_map.json b/step_6000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_6000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_6000/tokenizer.json b/step_6000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_6000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_6000/tokenizer_config.json b/step_6000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_6000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +} diff --git a/step_8000/added_tokens.json b/step_8000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bec714548f527774aeb27e57c4db291ff27e6b --- /dev/null +++ b/step_8000/added_tokens.json @@ -0,0 +1,204 @@ +{ + "__ace_Latn__": 256002, + "__ace__": 256001, + "__acm__": 256003, + "__acq__": 256004, + "__aeb__": 256005, + "__afr__": 256006, + "__ajp__": 256007, + "__aka__": 256008, + "__als__": 256162, + "__amh__": 256009, + "__apc__": 256010, + "__arb__": 256011, + "__ars__": 256012, + "__ary__": 256013, + "__arz__": 256014, + "__asm__": 256015, + "__ast__": 256016, + "__awa__": 256017, + "__ayr__": 256018, + "__azb__": 256019, + "__azj__": 256020, + "__bak__": 256021, + "__bam__": 256022, + "__ban__": 256023, + "__bel__": 256024, + "__bem__": 256025, + "__ben__": 256026, + "__bho__": 256027, + "__bjn_Latn__": 256029, + "__bjn__": 256028, + "__bod__": 256030, + "__bos__": 256031, + "__bug__": 256032, + "__bul__": 256033, + "__cat__": 256034, + "__ceb__": 256035, + "__ces__": 256036, + "__cjk__": 256037, + "__ckb__": 256038, + "__cmn_Hant__": 256201, + "__cmn__": 256200, + "__crh__": 256039, + "__cym__": 256040, + "__dan__": 256041, + "__deu__": 256042, + "__dik__": 256043, + "__dyu__": 256044, + "__dzo__": 256045, + "__ell__": 256046, + "__eng__": 256047, + "__epo__": 256048, + "__est__": 256049, + "__eus__": 256050, + "__ewe__": 256051, + "__fao__": 256052, + "__fij__": 256054, + "__fin__": 256055, + "__fon__": 256056, + "__fra__": 256057, + "__fur__": 256058, + "__fuv__": 256059, + "__gaz__": 256135, + "__gla__": 256060, + "__gle__": 256061, + "__glg__": 256062, + "__grn__": 256063, + "__guj__": 256064, + "__hat__": 256065, + "__hau__": 256066, + "__heb__": 256067, + "__hin__": 256068, + "__hne__": 256069, + "__hrv__": 256070, + "__hun__": 256071, + "__hye__": 256072, + "__ibo__": 256073, + "__ilo__": 256074, + "__ind__": 256075, + "__isl__": 256076, + "__ita__": 256077, + "__jav__": 256078, + "__jpn__": 256079, + "__kab__": 256080, + "__kac__": 256081, + "__kam__": 256082, + "__kan__": 256083, + "__kas_Deva__": 256085, + "__kas__": 256084, + "__kat__": 256086, + "__kaz__": 256089, + "__kbp__": 256090, + "__kea__": 256091, + "__khk__": 256122, + "__khm__": 256092, + "__kik__": 256093, + "__kin__": 256094, + "__kir__": 256095, + "__kmb__": 256096, + "__kmr__": 256099, + "__knc_Latn__": 256088, + "__knc__": 256087, + "__kon__": 256097, + "__kor__": 256098, + "__lao__": 256100, + "__lij__": 256102, + "__lim__": 256103, + "__lin__": 256104, + "__lit__": 256105, + "__lmo__": 256106, + "__ltg__": 256107, + "__ltz__": 256108, + "__lua__": 256109, + "__lug__": 256110, + "__luo__": 256111, + "__lus__": 256112, + "__lvs__": 256101, + "__mag__": 256113, + "__mai__": 256114, + "__mal__": 256115, + "__mar__": 256116, + "__min__": 256117, + "__mkd__": 256118, + "__mlt__": 256120, + "__mni__": 256121, + "__mos__": 256123, + "__mri__": 256124, + "__mya__": 256126, + "__nld__": 256127, + "__nno__": 256128, + "__nob__": 256129, + "__npi__": 256130, + "__nso__": 256131, + "__nus__": 256132, + "__nya__": 256133, + "__oci__": 256134, + "__ory__": 256136, + "__pag__": 256137, + "__pan__": 256138, + "__pap__": 256139, + "__pbt__": 256143, + "__pes__": 256053, + "__plt__": 256119, + "__pol__": 256140, + "__por__": 256141, + "__prs__": 256142, + "__quy__": 256144, + "__ron__": 256145, + "__run__": 256146, + "__rus__": 256147, + "__sag__": 256148, + "__san__": 256149, + "__sat__": 256150, + "__scn__": 256151, + "__shn__": 256152, + "__sin__": 256153, + "__slk__": 256154, + "__slv__": 256155, + "__smo__": 256156, + "__sna__": 256157, + "__snd__": 256158, + "__som__": 256159, + "__sot__": 256160, + "__spa__": 256161, + "__srd__": 256163, + "__srp__": 256164, + "__ssw__": 256165, + "__sun__": 256166, + "__swe__": 256167, + "__swh__": 256168, + "__szl__": 256169, + "__tam__": 256170, + "__taq_Tfng__": 256178, + "__taq__": 256177, + "__tat__": 256171, + "__tel__": 256172, + "__tgk__": 256173, + "__tgl__": 256174, + "__tha__": 256175, + "__tir__": 256176, + "__tpi__": 256179, + "__tsn__": 256180, + "__tso__": 256181, + "__tuk__": 256182, + "__tum__": 256183, + "__tur__": 256184, + "__twi__": 256185, + "__tzm__": 256186, + "__uig__": 256187, + "__ukr__": 256188, + "__umb__": 256189, + "__urd__": 256190, + "__uzn__": 256191, + "__vec__": 256192, + "__vie__": 256193, + "__war__": 256194, + "__wol__": 256195, + "__xho__": 256196, + "__ydd__": 256197, + "__yor__": 256198, + "__yue__": 256199, + "__zsm__": 256125, + "__zul__": 256202 +} diff --git a/step_8000/config.json b/step_8000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a469663221dfab7d1bc1580a0a9d0afd263e356b --- /dev/null +++ b/step_8000/config.json @@ -0,0 +1,115 @@ +{ + "_name_or_path": "facebook/hf-seamless-m4t-medium", + "activation_dropout": 0.0, + "activation_function": "relu", + "adaptor_dropout": 0.1, + "adaptor_kernel_size": 8, + "adaptor_stride": 8, + "add_adapter": true, + "architectures": [ + "SeamlessM4TForTextToText" + ], + "attention_dropout": 0.1, + "bos_token_id": 2, + "conv_depthwise_kernel_size": 31, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.05, + "decoder_layers": 12, + "decoder_start_token_id": 3, + "dropout": 0.1, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.05, + "encoder_layers": 12, + "eos_token_id": 3, + "feature_projection_input_dim": 160, + "hidden_size": 1024, + "initializer_range": 0.02, + "is_encoder_decoder": true, + "lang_embed_dim": 256, + "layer_norm_eps": 1e-05, + "leaky_relu_slope": 0.1, + "max_new_tokens": 256, + "max_position_embeddings": 4096, + "max_source_positions": 4096, + "model_type": "seamless_m4t", + "num_adapter_layers": 1, + "num_attention_heads": 16, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_hidden_layers": 12, + "pad_token_id": 0, + "position_embeddings_type": "relative", + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "rotary_embedding_base": 10000, + "sampling_rate": 16000, + "scale_embedding": true, + "speech_encoder_attention_heads": 16, + "speech_encoder_dropout": 0.0, + "speech_encoder_hidden_act": "swish", + "speech_encoder_intermediate_size": 4096, + "speech_encoder_layerdrop": 0.1, + "speech_encoder_layers": 12, + "spkr_embed_dim": 256, + "t2u_bos_token_id": 0, + "t2u_decoder_attention_heads": 16, + "t2u_decoder_ffn_dim": 8192, + "t2u_decoder_layers": 4, + "t2u_decoder_start_token_id": 2, + "t2u_encoder_attention_heads": 16, + "t2u_encoder_ffn_dim": 8192, + "t2u_encoder_layers": 4, + "t2u_eos_token_id": 2, + "t2u_max_new_tokens": 1024, + "t2u_max_position_embeddings": 2048, + "t2u_pad_token_id": 1, + "t2u_vocab_size": 10082, + "torch_dtype": "float32", + "transformers_version": "4.43.3", + "unit_embed_dim": 1280, + "unit_hifi_gan_vocab_size": 10000, + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 11, + 8, + 8, + 4, + 4 + ], + "upsample_rates": [ + 5, + 4, + 4, + 2, + 2 + ], + "use_cache": true, + "var_pred_dropout": 0.5, + "variance_predictor_kernel_size": 3, + "vocab_size": 256206, + "vocoder_num_langs": 36, + "vocoder_num_spkrs": 200, + "vocoder_offset": 4 +} diff --git a/step_8000/generation_config.json b/step_8000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..994647aa75ed4ed9ab4c7e2c6fd4aa5db7a1fe7c --- /dev/null +++ b/step_8000/generation_config.json @@ -0,0 +1,290 @@ +{ + "bos_token_id": 2, + "decoder_start_token_id": 3, + "eos_token_id": 3, + "max_new_tokens": 256, + "pad_token_id": 0, + "t2u_lang_code_to_id": { + "arb": 10043, + "ben": 10044, + "cat": 10045, + "ces": 10046, + "cmn": 10047, + "cym": 10048, + "dan": 10049, + "deu": 10050, + "eng": 10051, + "est": 10052, + "fin": 10053, + "fra": 10054, + "hin": 10055, + "ind": 10056, + "ita": 10057, + "jpn": 10058, + "kan": 10059, + "kor": 10060, + "mlt": 10061, + "nld": 10062, + "pes": 10063, + "pol": 10064, + "por": 10065, + "ron": 10066, + "rus": 10067, + "slk": 10068, + "spa": 10069, + "swe": 10070, + "swh": 10071, + "tam": 10072, + "tel": 10073, + "tgl": 10074, + "tha": 10075, + "tur": 10076, + "ukr": 10077, + "urd": 10078, + "uzn": 10079, + "vie": 10080 + }, + "text_decoder_lang_to_code_id": { + "ace": 256001, + "ace_Latn": 256002, + "acm": 256003, + "acq": 256004, + "aeb": 256005, + "afr": 256006, + "ajp": 256007, + "aka": 256008, + "als": 256162, + "amh": 256009, + "apc": 256010, + "arb": 256011, + "ars": 256012, + "ary": 256013, + "arz": 256014, + "asm": 256015, + "ast": 256016, + "awa": 256017, + "ayr": 256018, + "azb": 256019, + "azj": 256020, + "bak": 256021, + "bam": 256022, + "ban": 256023, + "bel": 256024, + "bem": 256025, + "ben": 256026, + "bho": 256027, + "bjn": 256028, + "bjn_Latn": 256029, + "bod": 256030, + "bos": 256031, + "bug": 256032, + "bul": 256033, + "cat": 256034, + "ceb": 256035, + "ces": 256036, + "cjk": 256037, + "ckb": 256038, + "cmn": 256200, + "cmn_Hant": 256201, + "crh": 256039, + "cym": 256040, + "dan": 256041, + "deu": 256042, + "dik": 256043, + "dyu": 256044, + "dzo": 256045, + "ell": 256046, + "eng": 256047, + "epo": 256048, + "est": 256049, + "eus": 256050, + "ewe": 256051, + "fao": 256052, + "fij": 256054, + "fin": 256055, + "fon": 256056, + "fra": 256057, + "fur": 256058, + "fuv": 256059, + "gaz": 256135, + "gla": 256060, + "gle": 256061, + "glg": 256062, + "grn": 256063, + "guj": 256064, + "hat": 256065, + "hau": 256066, + "heb": 256067, + "hin": 256068, + "hne": 256069, + "hrv": 256070, + "hun": 256071, + "hye": 256072, + "ibo": 256073, + "ilo": 256074, + "ind": 256075, + "isl": 256076, + "ita": 256077, + "jav": 256078, + "jpn": 256079, + "kab": 256080, + "kac": 256081, + "kam": 256082, + "kan": 256083, + "kas": 256084, + "kas_Deva": 256085, + "kat": 256086, + "kaz": 256089, + "kbp": 256090, + "kea": 256091, + "khk": 256122, + "khm": 256092, + "kik": 256093, + "kin": 256094, + "kir": 256095, + "kmb": 256096, + "kmr": 256099, + "knc": 256087, + "knc_Latn": 256088, + "kon": 256097, + "kor": 256098, + "lao": 256100, + "lij": 256102, + "lim": 256103, + "lin": 256104, + "lit": 256105, + "lmo": 256106, + "ltg": 256107, + "ltz": 256108, + "lua": 256109, + "lug": 256110, + "luo": 256111, + "lus": 256112, + "lvs": 256101, + "mag": 256113, + "mai": 256114, + "mal": 256115, + "mar": 256116, + "min": 256117, + "mkd": 256118, + "mlt": 256120, + "mni": 256121, + "mos": 256123, + "mri": 256124, + "mya": 256126, + "nld": 256127, + "nno": 256128, + "nob": 256129, + "npi": 256130, + "nso": 256131, + "nus": 256132, + "nya": 256133, + "oci": 256134, + "ory": 256136, + "pag": 256137, + "pan": 256138, + "pap": 256139, + "pbt": 256143, + "pes": 256053, + "plt": 256119, + "pol": 256140, + "por": 256141, + "prs": 256142, + "quy": 256144, + "ron": 256145, + "run": 256146, + "rus": 256147, + "sag": 256148, + "san": 256149, + "sat": 256150, + "scn": 256151, + "shn": 256152, + "sin": 256153, + "slk": 256154, + "slv": 256155, + "smo": 256156, + "sna": 256157, + "snd": 256158, + "som": 256159, + "sot": 256160, + "spa": 256161, + "srd": 256163, + "srp": 256164, + "ssw": 256165, + "sun": 256166, + "swe": 256167, + "swh": 256168, + "szl": 256169, + "tam": 256170, + "taq": 256177, + "taq_Tfng": 256178, + "tat": 256171, + "tel": 256172, + "tgk": 256173, + "tgl": 256174, + "tha": 256175, + "tir": 256176, + "tpi": 256179, + "tsn": 256180, + "tso": 256181, + "tuk": 256182, + "tum": 256183, + "tur": 256184, + "twi": 256185, + "tzm": 256186, + "uig": 256187, + "ukr": 256188, + "umb": 256189, + "urd": 256190, + "uzn": 256191, + "vec": 256192, + "vie": 256193, + "war": 256194, + "wol": 256195, + "xho": 256196, + "ydd": 256197, + "yor": 256198, + "yue": 256199, + "zsm": 256125, + "zul": 256202 + }, + "transformers_version": "4.43.3", + "vocoder_lang_code_to_id": { + "arb": 0, + "ben": 1, + "cat": 2, + "ces": 3, + "cmn": 4, + "cym": 5, + "dan": 6, + "deu": 7, + "eng": 8, + "est": 9, + "fin": 10, + "fra": 11, + "hin": 12, + "ind": 13, + "ita": 14, + "jpn": 15, + "kor": 16, + "mlt": 17, + "nld": 18, + "pes": 19, + "pol": 20, + "por": 21, + "ron": 22, + "rus": 23, + "slk": 24, + "spa": 25, + "swe": 26, + "swh": 27, + "tel": 28, + "tgl": 29, + "tha": 30, + "tur": 31, + "ukr": 32, + "urd": 33, + "uzn": 34, + "vie": 35 + } +} diff --git a/step_8000/model.safetensors b/step_8000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f72838f2212ae46195c71e8d89f32542136d7e81 --- /dev/null +++ b/step_8000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e071b136354406dfb6148165d65a6475293c5d8d970f4889921173d322cb9102 +size 2460355904 diff --git a/step_8000/optimizer.bin b/step_8000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..80c58bc11d60047ee59654e9d97191aba5f9d37a --- /dev/null +++ b/step_8000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136c3c0aa18d8ed3daaa0eb4283df910b2c83f37df89698efc41683162102cd7 +size 4921023445 diff --git a/step_8000/random_states_0.pkl b/step_8000/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1f1125b6df03ec2a173c191f172540b834dfc51c --- /dev/null +++ b/step_8000/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde2154604791bb186fbd330ec41e6663f8ca302ec2ac9bf19f8d6fed92c9258 +size 14344 diff --git a/step_8000/scheduler.bin b/step_8000/scheduler.bin new file mode 100644 index 0000000000000000000000000000000000000000..b2ffe406917122026d693ab3d03199f92b7accf3 --- /dev/null +++ b/step_8000/scheduler.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764517e21d9a1d739c769c3bb6fbdd66e75980ad49a803d4fd3f0681fc557607 +size 1064 diff --git a/step_8000/sentencepiece.bpe.model b/step_8000/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..dc2262d3e1d375b235eb71c24119c8e73f85d4ad --- /dev/null +++ b/step_8000/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a +size 4852054 diff --git a/step_8000/special_tokens_map.json b/step_8000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9d884345949fd4badced21e2f4fb30b67ceba3b2 --- /dev/null +++ b/step_8000/special_tokens_map.json @@ -0,0 +1,252 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/step_8000/tokenizer.json b/step_8000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..311a92ad3ac59761f554eff5918284c67d602cb9 --- /dev/null +++ b/step_8000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ce3deacc5ca45173811ce104786501982fd65dd9d72a3f458965391f2a52a +size 17325605 diff --git a/step_8000/tokenizer_config.json b/step_8000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4383b1cb97310e9e9c719676ae4085e1a1cc2d --- /dev/null +++ b/step_8000/tokenizer_config.json @@ -0,0 +1,1874 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": "__ace__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256002": { + "content": "__ace_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256003": { + "content": "__acm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256004": { + "content": "__acq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256005": { + "content": "__aeb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256006": { + "content": "__afr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256007": { + "content": "__ajp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256008": { + "content": "__aka__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256009": { + "content": "__amh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256010": { + "content": "__apc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256011": { + "content": "__arb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256012": { + "content": "__ars__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256013": { + "content": "__ary__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256014": { + "content": "__arz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256015": { + "content": "__asm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256016": { + "content": "__ast__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256017": { + "content": "__awa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256018": { + "content": "__ayr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256019": { + "content": "__azb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256020": { + "content": "__azj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256021": { + "content": "__bak__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256022": { + "content": "__bam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256023": { + "content": "__ban__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256024": { + "content": "__bel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256025": { + "content": "__bem__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256026": { + "content": "__ben__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256027": { + "content": "__bho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256028": { + "content": "__bjn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256029": { + "content": "__bjn_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256030": { + "content": "__bod__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256031": { + "content": "__bos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256032": { + "content": "__bug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256033": { + "content": "__bul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256034": { + "content": "__cat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256035": { + "content": "__ceb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256036": { + "content": "__ces__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256037": { + "content": "__cjk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256038": { + "content": "__ckb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256039": { + "content": "__crh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256040": { + "content": "__cym__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256041": { + "content": "__dan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256042": { + "content": "__deu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256043": { + "content": "__dik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256044": { + "content": "__dyu__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256045": { + "content": "__dzo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256046": { + "content": "__ell__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256047": { + "content": "__eng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256048": { + "content": "__epo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256049": { + "content": "__est__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256050": { + "content": "__eus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256051": { + "content": "__ewe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256052": { + "content": "__fao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256053": { + "content": "__pes__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256054": { + "content": "__fij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256055": { + "content": "__fin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256056": { + "content": "__fon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256057": { + "content": "__fra__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256058": { + "content": "__fur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256059": { + "content": "__fuv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256060": { + "content": "__gla__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256061": { + "content": "__gle__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256062": { + "content": "__glg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256063": { + "content": "__grn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256064": { + "content": "__guj__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256065": { + "content": "__hat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256066": { + "content": "__hau__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256067": { + "content": "__heb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256068": { + "content": "__hin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256069": { + "content": "__hne__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256070": { + "content": "__hrv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256071": { + "content": "__hun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256072": { + "content": "__hye__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256073": { + "content": "__ibo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256074": { + "content": "__ilo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256075": { + "content": "__ind__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256076": { + "content": "__isl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256077": { + "content": "__ita__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256078": { + "content": "__jav__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256079": { + "content": "__jpn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256080": { + "content": "__kab__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256081": { + "content": "__kac__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256082": { + "content": "__kam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256083": { + "content": "__kan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256084": { + "content": "__kas__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256085": { + "content": "__kas_Deva__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256086": { + "content": "__kat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256087": { + "content": "__knc__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256088": { + "content": "__knc_Latn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256089": { + "content": "__kaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256090": { + "content": "__kbp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256091": { + "content": "__kea__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256092": { + "content": "__khm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256093": { + "content": "__kik__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256094": { + "content": "__kin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256095": { + "content": "__kir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256096": { + "content": "__kmb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256097": { + "content": "__kon__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256098": { + "content": "__kor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256099": { + "content": "__kmr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256100": { + "content": "__lao__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256101": { + "content": "__lvs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256102": { + "content": "__lij__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256103": { + "content": "__lim__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256104": { + "content": "__lin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256105": { + "content": "__lit__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256106": { + "content": "__lmo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256107": { + "content": "__ltg__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256108": { + "content": "__ltz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256109": { + "content": "__lua__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256110": { + "content": "__lug__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256111": { + "content": "__luo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256112": { + "content": "__lus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256113": { + "content": "__mag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256114": { + "content": "__mai__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256115": { + "content": "__mal__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256116": { + "content": "__mar__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256117": { + "content": "__min__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256118": { + "content": "__mkd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256119": { + "content": "__plt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256120": { + "content": "__mlt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256121": { + "content": "__mni__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256122": { + "content": "__khk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256123": { + "content": "__mos__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256124": { + "content": "__mri__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256125": { + "content": "__zsm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256126": { + "content": "__mya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256127": { + "content": "__nld__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256128": { + "content": "__nno__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256129": { + "content": "__nob__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256130": { + "content": "__npi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256131": { + "content": "__nso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256132": { + "content": "__nus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256133": { + "content": "__nya__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256134": { + "content": "__oci__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256135": { + "content": "__gaz__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256136": { + "content": "__ory__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256137": { + "content": "__pag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256138": { + "content": "__pan__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256139": { + "content": "__pap__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256140": { + "content": "__pol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256141": { + "content": "__por__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256142": { + "content": "__prs__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256143": { + "content": "__pbt__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256144": { + "content": "__quy__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256145": { + "content": "__ron__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256146": { + "content": "__run__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256147": { + "content": "__rus__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256148": { + "content": "__sag__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256149": { + "content": "__san__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256150": { + "content": "__sat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256151": { + "content": "__scn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256152": { + "content": "__shn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256153": { + "content": "__sin__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256154": { + "content": "__slk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256155": { + "content": "__slv__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256156": { + "content": "__smo__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256157": { + "content": "__sna__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256158": { + "content": "__snd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256159": { + "content": "__som__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256160": { + "content": "__sot__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256161": { + "content": "__spa__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256162": { + "content": "__als__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256163": { + "content": "__srd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256164": { + "content": "__srp__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256165": { + "content": "__ssw__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256166": { + "content": "__sun__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256167": { + "content": "__swe__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256168": { + "content": "__swh__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256169": { + "content": "__szl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256170": { + "content": "__tam__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256171": { + "content": "__tat__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256172": { + "content": "__tel__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256173": { + "content": "__tgk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256174": { + "content": "__tgl__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256175": { + "content": "__tha__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256176": { + "content": "__tir__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256177": { + "content": "__taq__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256178": { + "content": "__taq_Tfng__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256179": { + "content": "__tpi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256180": { + "content": "__tsn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256181": { + "content": "__tso__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256182": { + "content": "__tuk__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256183": { + "content": "__tum__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256184": { + "content": "__tur__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256185": { + "content": "__twi__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256186": { + "content": "__tzm__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256187": { + "content": "__uig__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256188": { + "content": "__ukr__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256189": { + "content": "__umb__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256190": { + "content": "__urd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256191": { + "content": "__uzn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256192": { + "content": "__vec__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256193": { + "content": "__vie__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256194": { + "content": "__war__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256195": { + "content": "__wol__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256196": { + "content": "__xho__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256197": { + "content": "__ydd__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256198": { + "content": "__yor__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256199": { + "content": "__yue__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256200": { + "content": "__cmn__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256201": { + "content": "__cmn_Hant__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "256202": { + "content": "__zul__", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "__ace__", + "__ace_Latn__", + "__acm__", + "__acq__", + "__aeb__", + "__afr__", + "__ajp__", + "__aka__", + "__amh__", + "__apc__", + "__arb__", + "__ars__", + "__ary__", + "__arz__", + "__asm__", + "__ast__", + "__awa__", + "__ayr__", + "__azb__", + "__azj__", + "__bak__", + "__bam__", + "__ban__", + "__bel__", + "__bem__", + "__ben__", + "__bho__", + "__bjn__", + "__bjn_Latn__", + "__bod__", + "__bos__", + "__bug__", + "__bul__", + "__cat__", + "__ceb__", + "__ces__", + "__cjk__", + "__ckb__", + "__crh__", + "__cym__", + "__dan__", + "__deu__", + "__dik__", + "__dyu__", + "__dzo__", + "__ell__", + "__eng__", + "__epo__", + "__est__", + "__eus__", + "__ewe__", + "__fao__", + "__pes__", + "__fij__", + "__fin__", + "__fon__", + "__fra__", + "__fur__", + "__fuv__", + "__gla__", + "__gle__", + "__glg__", + "__grn__", + "__guj__", + "__hat__", + "__hau__", + "__heb__", + "__hin__", + "__hne__", + "__hrv__", + "__hun__", + "__hye__", + "__ibo__", + "__ilo__", + "__ind__", + "__isl__", + "__ita__", + "__jav__", + "__jpn__", + "__kab__", + "__kac__", + "__kam__", + "__kan__", + "__kas__", + "__kas_Deva__", + "__kat__", + "__knc__", + "__knc_Latn__", + "__kaz__", + "__kbp__", + "__kea__", + "__khm__", + "__kik__", + "__kin__", + "__kir__", + "__kmb__", + "__kon__", + "__kor__", + "__kmr__", + "__lao__", + "__lvs__", + "__lij__", + "__lim__", + "__lin__", + "__lit__", + "__lmo__", + "__ltg__", + "__ltz__", + "__lua__", + "__lug__", + "__luo__", + "__lus__", + "__mag__", + "__mai__", + "__mal__", + "__mar__", + "__min__", + "__mkd__", + "__plt__", + "__mlt__", + "__mni__", + "__khk__", + "__mos__", + "__mri__", + "__zsm__", + "__mya__", + "__nld__", + "__nno__", + "__nob__", + "__npi__", + "__nso__", + "__nus__", + "__nya__", + "__oci__", + "__gaz__", + "__ory__", + "__pag__", + "__pan__", + "__pap__", + "__pol__", + "__por__", + "__prs__", + "__pbt__", + "__quy__", + "__ron__", + "__run__", + "__rus__", + "__sag__", + "__san__", + "__sat__", + "__scn__", + "__shn__", + "__sin__", + "__slk__", + "__slv__", + "__smo__", + "__sna__", + "__snd__", + "__som__", + "__sot__", + "__spa__", + "__als__", + "__srd__", + "__srp__", + "__ssw__", + "__sun__", + "__swe__", + "__swh__", + "__szl__", + "__tam__", + "__tat__", + "__tel__", + "__tgk__", + "__tgl__", + "__tha__", + "__tir__", + "__taq__", + "__taq_Tfng__", + "__tpi__", + "__tsn__", + "__tso__", + "__tuk__", + "__tum__", + "__tur__", + "__twi__", + "__tzm__", + "__uig__", + "__ukr__", + "__umb__", + "__urd__", + "__uzn__", + "__vec__", + "__vie__", + "__war__", + "__wol__", + "__xho__", + "__ydd__", + "__yor__", + "__yue__", + "__cmn__", + "__cmn_Hant__", + "__zul__" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "SeamlessM4TProcessor", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "__dan__", + "tgt_lang": "__fra__", + "tokenizer_class": "SeamlessM4TTokenizer", + "unk_token": "" +}