Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +2 -0
config.json +159 -0
tokenizer.py +836 -0
xtts-v2.safetensors +3 -0
xtts2_config.py +228 -0
xtts2_modeling.py +940 -129
xttsv2_gpt2/config.json +44 -0
xttsv2_gpt2/gpt2_model.safetensors +3 -0
xttsv2_gpt2/gpt_config.py +143 -0
xttsv2_gpt2/special_tokens_map.json +6 -0
xttsv2_gpt2/tokenizer.json +0 -0
xttsv2_gpt2/tokenizer.py +887 -0
xttsv2_gpt2/tokenizer_config.json +192 -0
xttsv2_gpt2/xtts2_gpt_modeling.py +505 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ xtts-v2.safetensors filter=lfs diff=lfs merge=lfs -text
2	+ xttsv2_gpt2/gpt2_model.safetensors filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,159 @@

+{
+  "architectures": [
+    "Xtts"
+  ],
+  "audio_config": {
+    "fmax": 8000,
+    "fmin": 0,
+    "hop_length": 256,
+    "mel_channels": 80,
+    "mel_norms_file": null,
+    "n_fft": 1024,
+    "output_sample_rate": 24000,
+    "power": 1.0,
+    "sample_rate": 22050,
+    "win_length": 1024
+  },
+  "auto_map": {
+    "AutoConfig": "AstraMindAI/xtts2--xtts2_config.XTTSConfig",
+    "AutoModelForCausalLM": "AstraMindAI/xtts2--xtts2_modeling.Xtts",
+    "AutoTokenizer": "AstraMindAI/xtts2--tokenizer.XTTSTokenizerFast"
+  },
+  "cond_d_vector_in_each_upsampling_layer": true,
+  "d_vector_dim": 512,
+  "decoder_input_dim": 1024,
+  "duration_const": 102400,
+  "gpt": {
+    "model_type": "xtts_gpt"
+  },
+  "gpt_code_stride_len": 1024,
+  "gpt_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "activation_function": "gelu",
+    "add_cross_attention": false,
+    "architectures": [
+      "XttsGPT"
+    ],
+    "attn_pdrop": 0.1,
+    "audio_config": {
+      "mel_channels": 80,
+      "output_sample_rate": 24000,
+      "sample_rate": 22050
+    },
+    "auto_map": {
+      "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+      "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT"
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_input_dim": 1024,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "enable_redaction": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gpt_batch_size": 1,
+    "gpt_max_audio_tokens": 605,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "kv_cache": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_epsilon": 1e-05,
+    "length_penalty": 1.0,
+    "max_audio_tokens": 605,
+    "max_length": 20,
+    "max_prompt_tokens": 70,
+    "max_text_tokens": 402,
+    "min_length": 0,
+    "model_type": "xtts_gpt",
+    "n_inner": 4096,
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_audio_tokens": 1026,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 30,
+    "num_return_sequences": 1,
+    "number_text_tokens": 6681,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "reorder_and_upcast_attn": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_attn_by_inverse_layer_idx": false,
+    "sep_token_id": null,
+    "start_audio_token": 1024,
+    "start_text_token": null,
+    "stop_audio_token": 1025,
+    "stop_text_token": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.46.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_masking_gt_prompt_approach": true,
+    "use_perceiver_resampler": true,
+    "vocab_size": 6681
+  },
+  "input_sample_rate": 22050,
+  "languages": [
+    "en",
+    "es",
+    "fr",
+    "de",
+    "it",
+    "pt",
+    "pl",
+    "tr",
+    "ru",
+    "nl",
+    "cs",
+    "ar",
+    "zh-cn",
+    "hu",
+    "ko",
+    "ja",
+    "hi"
+  ],
+  "model_type": "xtts",
+  "num_chars": 255,
+  "output_hop_length": 256,
+  "output_sample_rate": 24000,
+  "tokenizer_file": "",
+  "transformers_version": "4.46.0"
+}

tokenizer.py ADDED Viewed

	@@ -0,0 +1,836 @@

+import os
+import re
+import textwrap
+from typing import List, Optional, Union, Dict, Any
+from functools import cached_property
+import pypinyin
+import torch
+from hangul_romanize import Transliter
+from hangul_romanize.rule import academic
+from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
+from transformers import PreTrainedTokenizerFast, BatchEncoding
+from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
+from tokenizers import Tokenizer
+from tokenizers.pre_tokenizers import WhitespaceSplit
+from tokenizers.processors import TemplateProcessing
+from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
+import cutlet
+# Text preprocessing functions
+def get_spacy_lang(lang):
+    if lang == "zh":
+        return Chinese()
+    elif lang == "ja":
+        return Japanese()
+    elif lang == "ar":
+        return Arabic()
+    elif lang == "es":
+        return Spanish()
+    else:
+        # For most languages, English does the job
+        return English()
+def split_sentence(text, lang, text_split_length=250):
+    """Preprocess the input text and split into sentences based on language."""
+    text_splits = []
+    if text_split_length is not None and len(text) >= text_split_length:
+        text_splits.append("")
+        nlp = get_spacy_lang(lang)
+        nlp.add_pipe("sentencizer")
+        doc = nlp(text)
+        for sentence in doc.sents:
+            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
+                text_splits[-1] += " " + str(sentence)
+                text_splits[-1] = text_splits[-1].lstrip()
+            elif len(str(sentence)) > text_split_length:
+                for line in textwrap.wrap(
+                    str(sentence),
+                    width=text_split_length,
+                    drop_whitespace=True,
+                    break_on_hyphens=False,
+                    tabsize=1,
+                ):
+                    text_splits.append(str(line))
+            else:
+                text_splits.append(str(sentence))
+        if len(text_splits) > 1 and text_splits[0] == "":
+                del text_splits[0]
+    else:
+        text_splits = [text.lstrip()]
+    return text_splits
+_whitespace_re = re.compile(r"\s+")
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = {
+    "en": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("mrs", "misess"),
+            ("mr", "mister"),
+            ("dr", "doctor"),
+            ("st", "saint"),
+            ("co", "company"),
+            ("jr", "junior"),
+            ("maj", "major"),
+            ("gen", "general"),
+            ("drs", "doctors"),
+            ("rev", "reverend"),
+            ("lt", "lieutenant"),
+            ("hon", "honorable"),
+            ("sgt", "sergeant"),
+            ("capt", "captain"),
+            ("esq", "esquire"),
+            ("ltd", "limited"),
+            ("col", "colonel"),
+            ("ft", "fort"),
+        ]
+    ],
+    "es": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("sra", "señora"),
+            ("sr", "señor"),
+            ("dr", "doctor"),
+            ("dra", "doctora"),
+            ("st", "santo"),
+            ("co", "compañía"),
+            ("jr", "junior"),
+            ("ltd", "limitada"),
+        ]
+    ],
+    "fr": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("mme", "madame"),
+            ("mr", "monsieur"),
+            ("dr", "docteur"),
+            ("st", "saint"),
+            ("co", "compagnie"),
+            ("jr", "junior"),
+            ("ltd", "limitée"),
+        ]
+    ],
+    "de": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("fr", "frau"),
+            ("dr", "doktor"),
+            ("st", "sankt"),
+            ("co", "firma"),
+            ("jr", "junior"),
+        ]
+    ],
+    "pt": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("sra", "senhora"),
+            ("sr", "senhor"),
+            ("dr", "doutor"),
+            ("dra", "doutora"),
+            ("st", "santo"),
+            ("co", "companhia"),
+            ("jr", "júnior"),
+            ("ltd", "limitada"),
+        ]
+    ],
+    "it": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # ("sig.ra", "signora"),
+            ("sig", "signore"),
+            ("dr", "dottore"),
+            ("st", "santo"),
+            ("co", "compagnia"),
+            ("jr", "junior"),
+            ("ltd", "limitata"),
+        ]
+    ],
+    "pl": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("p", "pani"),
+            ("m", "pan"),
+            ("dr", "doktor"),
+            ("sw", "święty"),
+            ("jr", "junior"),
+        ]
+    ],
+    "ar": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # There are not many common abbreviations in Arabic as in English.
+        ]
+    ],
+    "zh": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
+    "cs": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dr", "doktor"),  # doctor
+            ("ing", "inženýr"),  # engineer
+            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
+            # Other abbreviations would be specialized and not as common.
+        ]
+    ],
+    "ru": [
+        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("г-жа", "госпожа"),  # Mrs.
+            ("г-н", "господин"),  # Mr.
+            ("д-р", "доктор"),  # doctor
+            # Other abbreviations are less common or specialized.
+        ]
+    ],
+    "nl": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dhr", "de heer"),  # Mr.
+            ("mevr", "mevrouw"),  # Mrs.
+            ("dr", "dokter"),  # doctor
+            ("jhr", "jonkheer"),  # young lord or nobleman
+            # Dutch uses more abbreviations, but these are the most common ones.
+        ]
+    ],
+    "tr": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("b", "bay"),  # Mr.
+            ("byk", "büyük"),  # büyük
+            ("dr", "doktor"),  # doctor
+            # Add other Turkish abbreviations here if needed.
+        ]
+    ],
+    "hu": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dr", "doktor"),  # doctor
+            ("b", "bácsi"),  # Mr.
+            ("nőv", "nővér"),  # nurse
+            # Add other Hungarian abbreviations here if needed.
+        ]
+    ],
+    "ko": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
+}
+def expand_abbreviations_multilingual(text, lang="en"):
+    if lang in _abbreviations:
+        for regex, replacement in _abbreviations[lang]:
+            text = re.sub(regex, replacement, text)
+    return text
+_symbols_multilingual = {
+    "en": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " and "),
+            ("@", " at "),
+            ("%", " percent "),
+            ("#", " hash "),
+            ("$", " dollar "),
+            ("£", " pound "),
+            ("°", " degree "),
+        ]
+    ],
+    "es": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " y "),
+            ("@", " arroba "),
+            ("%", " por ciento "),
+            ("#", " numeral "),
+            ("$", " dolar "),
+            ("£", " libra "),
+            ("°", " grados "),
+        ]
+    ],
+    "fr": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " et "),
+            ("@", " arobase "),
+            ("%", " pour cent "),
+            ("#", " dièse "),
+            ("$", " dollar "),
+            ("£", " livre "),
+            ("°", " degrés "),
+        ]
+    ],
+    "de": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " und "),
+            ("@", " at "),
+            ("%", " prozent "),
+            ("#", " raute "),
+            ("$", " dollar "),
+            ("£", " pfund "),
+            ("°", " grad "),
+        ]
+    ],
+    "pt": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " e "),
+            ("@", " arroba "),
+            ("%", " por cento "),
+            ("#", " cardinal "),
+            ("$", " dólar "),
+            ("£", " libra "),
+            ("°", " graus "),
+        ]
+    ],
+    "it": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " e "),
+            ("@", " chiocciola "),
+            ("%", " per cento "),
+            ("#", " cancelletto "),
+            ("$", " dollaro "),
+            ("£", " sterlina "),
+            ("°", " gradi "),
+        ]
+    ],
+    "pl": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " i "),
+            ("@", " małpa "),
+            ("%", " procent "),
+            ("#", " krzyżyk "),
+            ("$", " dolar "),
+            ("£", " funt "),
+            ("°", " stopnie "),
+        ]
+    ],
+    "ar": [
+        # Arabic
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " و "),
+            ("@", " على "),
+            ("%", " في المئة "),
+            ("#", " رقم "),
+            ("$", " دولار "),
+            ("£", " جنيه "),
+            ("°", " درجة "),
+        ]
+    ],
+    "zh": [
+        # Chinese
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " 和 "),
+            ("@", " 在 "),
+            ("%", " 百分之 "),
+            ("#", " 号 "),
+            ("$", " 美元 "),
+            ("£", " 英镑 "),
+            ("°", " 度 "),
+        ]
+    ],
+    "cs": [
+        # Czech
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " a "),
+            ("@", " na "),
+            ("%", " procento "),
+            ("#", " křížek "),
+            ("$", " dolar "),
+            ("£", " libra "),
+            ("°", " stupně "),
+        ]
+    ],
+    "ru": [
+        # Russian
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " и "),
+            ("@", " собака "),
+            ("%", " процентов "),
+            ("#", " номер "),
+            ("$", " доллар "),
+            ("£", " фунт "),
+            ("°", " градус "),
+        ]
+    ],
+    "nl": [
+        # Dutch
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " en "),
+            ("@", " bij "),
+            ("%", " procent "),
+            ("#", " hekje "),
+            ("$", " dollar "),
+            ("£", " pond "),
+            ("°", " graden "),
+        ]
+    ],
+    "tr": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " ve "),
+            ("@", " at "),
+            ("%", " yüzde "),
+            ("#", " diyez "),
+            ("$", " dolar "),
+            ("£", " sterlin "),
+            ("°", " derece "),
+        ]
+    ],
+    "hu": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " és "),
+            ("@", " kukac "),
+            ("%", " százalék "),
+            ("#", " kettőskereszt "),
+            ("$", " dollár "),
+            ("£", " font "),
+            ("°", " fok "),
+        ]
+    ],
+    "ko": [
+        # Korean
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " 그리고 "),
+            ("@", " 에 "),
+            ("%", " 퍼센트 "),
+            ("#", " 번호 "),
+            ("$", " 달러 "),
+            ("£", " 파운드 "),
+            ("°", " 도 "),
+        ]
+    ],
+}
+def expand_symbols_multilingual(text, lang="en"):
+    if lang in _symbols_multilingual:
+        for regex, replacement in _symbols_multilingual[lang]:
+            text = re.sub(regex, replacement, text)
+            text = text.replace("  ", " ")  # Ensure there are no double spaces
+    return text.strip()
+# Per-language patterns for ordinals: group 1 captures the digits, followed by an ordinal marker.
+# NOTE(review): "tr" lists `\.` twice and "hu" lists `edik` twice (harmless duplicates).
+# NOTE(review): in "es"/"pt" the alternatives `o`/`a` come before `os`/`as`, and in "hu" `ödik`
+# comes before `ödike`; the shorter alternative always matches first, so the longer one is never
+# selected — confirm whether that is intended.
+_ordinal_re = {
+    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
+    "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
+    "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
+    "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
+    "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
+    "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
+    "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
+    "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
+    "cs": re.compile(r"([0-9]+)\.(?=\s|$)"),  # In Czech, a dot is often used after the number to indicate ordinals.
+    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
+    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
+    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
+    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
+    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
+}
+# Any run of ASCII digits.
+_number_re = re.compile(r"[0-9]+")
+# Amounts with the currency symbol directly before or directly after the digits;
+# dots and commas are allowed inside the amount.
+_currency_re = {
+    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
+    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
+    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
+}
+# Numbers using "," as thousands separator and an optional "." decimal part (e.g. 1,234.5).
+_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
+# Numbers using "." as thousands separator and an optional "," decimal part (e.g. 1.234,5).
+_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
+# A decimal number with either "." or "," as the decimal mark; group 1 is the whole number.
+_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
+def _remove_commas(m):
+    text = m.group(0)
+    if "," in text:
+        text = text.replace(",", "")
+    return text
+def _remove_dots(m):
+    text = m.group(0)
+    if "." in text:
+        text = text.replace(".", "")
+    return text
+def _expand_decimal_point(m, lang="en"):
+    amount = m.group(1).replace(",", ".")
+    return num2words(float(amount), lang=lang if lang != "cs" else "cz")
+def _expand_currency(m, lang="en", currency="USD"):
+    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
+    full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")
+    and_equivalents = {
+        "en": ", ",
+        "es": " con ",
+        "fr": " et ",
+        "de": " und ",
+        "pt": " e ",
+        "it": " e ",
+        "pl": ", ",
+        "cs": ", ",
+        "ru": ", ",
+        "nl": ", ",
+        "ar": ", ",
+        "tr": ", ",
+        "hu": ", ",
+        "ko": ", ",
+    }
+    if amount.is_integer():
+        last_and = full_amount.rfind(and_equivalents.get(lang, ", "))
+        if last_and != -1:
+            full_amount = full_amount[:last_and]
+    return full_amount
+def _expand_ordinal(m, lang="en"):
+    return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
+def _expand_number(m, lang="en"):
+    return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
+def expand_numbers_multilingual(text, lang="en"):
+    if lang == "zh":
+        text = zh_num2words()(text)
+    else:
+        if lang in ["en", "ru"]:
+            text = re.sub(_comma_number_re, _remove_commas, text)
+        else:
+            text = re.sub(_dot_number_re, _remove_dots, text)
+        try:
+            text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
+            text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
+            text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
+        except Exception as e:
+            pass
+        if lang != "tr":
+            text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
+        if lang in _ordinal_re:
+            text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
+        text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+def multilingual_cleaners(text, lang):
+    text = text.replace('"', "")
+    if lang == "tr":
+        text = text.replace("İ", "i")
+        text = text.replace("Ö", "ö")
+        text = text.replace("Ü", "ü")
+    text = lowercase(text)
+    text = expand_numbers_multilingual(text, lang)
+    text = expand_abbreviations_multilingual(text, lang)
+    text = expand_symbols_multilingual(text, lang=lang)
+    text = collapse_whitespace(text)
+    return text
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def chinese_transliterate(text):
+    return "".join(
+        [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
+    )
+def japanese_cleaners(text, katsu):
+    text = katsu.romaji(text)
+    text = lowercase(text)
+    return text
+def korean_transliterate(text, transliter):
+    return transliter.translit(text)
+# Fast Tokenizer Class
+class XTTSTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
+    """
+    def __init__(
+            self,
+            vocab_file: str = None,
+            tokenizer_object: Optional[Tokenizer] = None,
+            unk_token: str = "[UNK]",
+            pad_token: str = "[PAD]",
+            bos_token: str = "[START]",
+            eos_token: str = "[STOP]",
+            auto_map: dict = {"AutoTokenizer": ["AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast", None]},
+            clean_up_tokenization_spaces: bool = True,
+            **kwargs
+    ):
+        if tokenizer_object is None and vocab_file is not None:
+            tokenizer_object = Tokenizer.from_file(vocab_file)
+        if tokenizer_object is not None:
+            # Configure the tokenizer
+            tokenizer_object.pre_tokenizer = WhitespaceSplit()
+            tokenizer_object.post_processor = TemplateProcessing(
+                single=f"{bos_token} $A {eos_token}",
+                special_tokens=[
+                    (bos_token, tokenizer_object.token_to_id(bos_token)),
+                    (eos_token, tokenizer_object.token_to_id(eos_token)),
+                ],
+            )
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs
+        )
+        # Character limits per language
+        self.char_limits = {
+            "en": 250, "de": 253, "fr": 273, "es": 239,
+            "it": 213, "pt": 203, "pl": 224, "zh": 82,
+            "ar": 166, "cs": 186, "ru": 182, "nl": 251,
+            "tr": 226, "ja": 71, "hu": 224, "ko": 95,
+        }
+        # Initialize language tools
+        self._katsu = None
+        self._korean_transliter = Transliter(academic)
+        # Ensure pad_token_id is set
+        if self.pad_token_id is None:
+            self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)
+    @cached_property
+    def katsu(self):
+        if self._katsu is None:
+            self._katsu = cutlet.Cutlet()
+        return self._katsu
+    def preprocess_text(self, text: str, lang: str) -> str:
+        """Apply text preprocessing for language"""
+        base_lang = lang.split("-")[0]  # remove region
+        if base_lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it",
+                         "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+            text = multilingual_cleaners(text, base_lang)
+            if base_lang == "zh":
+                text = chinese_transliterate(text)
+            if base_lang == "ko":
+                text = korean_transliterate(text, self._korean_transliter)
+        elif base_lang == "ja":
+            text = japanese_cleaners(text, self.katsu)
+        else:
+            text = basic_cleaners(text)
+        return text
+    def batch_encode_with_split(self, texts: Union[str, List[str]], lang: Union[str, List[str]],
+                                **kwargs) -> torch.Tensor:
+        """
+        Split texts into smaller chunks based on language character limits and encode them using HuggingFace fast tokenizer.
+        strictly mimic the xttsv2 tokenizer
+        """
+        # Convert single inputs to lists
+        if isinstance(texts, str):
+            texts = [texts]
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(texts) > 1:
+            lang = lang * len(texts)
+        # Check if texts and lang have the same length
+        if len(texts) != len(lang):
+            raise ValueError(f"Number of texts ({len(texts)}) does not match number of languages ({len(lang)}).")
+        chunk_list = []
+        max_splits = 0
+        # For each text, split into chunks based on character limit
+        for text, text_lang in zip(texts, lang):
+            # Get language character limit
+            base_lang = text_lang.split("-")[0]
+            char_limit = self.char_limits.get(base_lang, 250)
+            # Clean and preprocess
+            text = self.preprocess_text(text, text_lang)
+            # Split text into sentences/chunks based on language
+            chunk_list = split_sentence(text, base_lang, text_split_length=char_limit)
+        # Ensure the tokenizer is a fast tokenizer
+        if not self.is_fast:
+            raise ValueError("The tokenizer must be a fast tokenizer.")
+        # Encode all chunks using the fast tokenizer
+        encoding: BatchEncoding = self(
+            chunk_list,
+            lang = lang,
+            add_special_tokens=False,
+            padding=False,
+            **kwargs
+        )
+        # The 'input_ids' tensor will have shape [total_chunks, max_sequence_length]
+        return encoding['input_ids']  # Tensor of shape [total_chunks, sequence_length]
+    def _batch_encode_plus(
+            self,
+            batch_text_or_text_pairs,
+            add_special_tokens: bool = True,
+            padding_strategy=PaddingStrategy.DO_NOT_PAD,
+            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
+            max_length: Optional[int] = None,
+            stride: int = 0,
+            is_split_into_words: bool = False,
+            pad_to_multiple_of: Optional[int] = None,
+            return_tensors: Optional[str] = None,
+            return_token_type_ids: Optional[bool] = None,
+            return_attention_mask: Optional[bool] = None,
+            return_overflowing_tokens: bool = False,
+            return_special_tokens_mask: bool = False,
+            return_offsets_mapping: bool = False,
+            return_length: bool = False,
+            verbose: bool = True,
+            **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Override batch encoding to handle language-specific preprocessing
+        """
+        lang = kwargs.pop("lang", ["en"] * len(batch_text_or_text_pairs))
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(batch_text_or_text_pairs) > 1:
+            lang = lang * len(batch_text_or_text_pairs)
+        # Check if batch_text_or_text_pairs and lang have the same length
+        if len(batch_text_or_text_pairs) != len(lang):
+            raise ValueError(f"Number of texts ({len(batch_text_or_text_pairs)}) does not match number of languages ({len(lang)}).")
+        # Preprocess each text in the batch with its corresponding language
+        processed_texts = []
+        for text, text_lang in zip(batch_text_or_text_pairs, lang):
+            if isinstance(text, str):
+                # Check length and preprocess
+                #self.check_input_length(text, text_lang)
+                processed_text = self.preprocess_text(text, text_lang)
+                # Format text with language tag and spaces
+                base_lang = text_lang.split("-")[0]
+                lang_code = "zh-cn" if base_lang == "zh" else base_lang
+                processed_text = f"[{lang_code}]{processed_text}"
+                processed_text = processed_text.replace(" ", "[SPACE]")
+                processed_texts.append(processed_text)
+            else:
+                processed_texts.append(text)
+        # Call the parent class's encoding method with processed texts
+        return super()._batch_encode_plus(
+            processed_texts,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs
+        )
+    def __call__(
+            self,
+            text: Union[str, List[str]],
+            lang: Union[str, List[str]] = "en",
+            add_special_tokens: bool = True,
+            padding: Union[bool, str, PaddingStrategy] = False,
+            truncation: Union[bool, str, TruncationStrategy] = False,
+            max_length: Optional[int] = None,
+            stride: int = 0,
+            return_tensors: Optional[str] = None,
+            return_token_type_ids: Optional[bool] = None,
+            return_attention_mask: Optional[bool] = True,
+            **kwargs
+    ):
+        """
+        Main tokenization method
+        """
+        # Convert single string to list for batch processing
+        if isinstance(text, str):
+            text = [text]
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(text) > 1:
+            lang = lang * len(text)
+        # Ensure text and lang lists have same length
+        if len(text) != len(lang):
+            raise ValueError(f"Number of texts ({len(text)}) does not match number of languages ({len(lang)}).")
+        # Convert padding strategy
+        if isinstance(padding, bool):
+            padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
+        else:
+            padding_strategy = PaddingStrategy(padding)
+        # Convert truncation strategy
+        if isinstance(truncation, bool):
+            truncation_strategy = TruncationStrategy.LONGEST_FIRST if truncation else TruncationStrategy.DO_NOT_TRUNCATE
+        else:
+            truncation_strategy = TruncationStrategy(truncation)
+        # Use the batch encoding method
+        encoded = self._batch_encode_plus(
+            text,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            lang=lang,
+            **kwargs
+        )
+        return encoded

xtts-v2.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3942a405803e6148140be867c3b9d2b601aff053b042e5e933ca89e49371072
+size 345226804

xtts2_config.py ADDED Viewed

	@@ -0,0 +1,228 @@

+from dataclasses import asdict, dataclass
+from typing import Dict, Optional, List
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+@dataclass
+class GPTAudioConfig:
+    """Configuration for GPT audio processing parameters.
+    XTTSGPTConfig builds an instance from its ``audio_config`` dict and
+    serializes it back with ``dataclasses.asdict``, so the field names are
+    also the keys expected in that dict.
+    """
+    # Number of mel channels (handed to the conditioning encoder by the model code)
+    mel_channels: int = 80
+    # Input sample rate -- presumably in Hz; TODO confirm against the audio loader
+    sample_rate: int = 22050
+    # Sample rate of the generated audio -- presumably in Hz; TODO confirm
+    output_sample_rate: int = 24000
+@dataclass
+class XTTSAudioConfig:
+    """Configuration for audio processing parameters.
+    XTTSConfig builds an instance from its ``audio_config`` dict and
+    serializes it back with ``dataclasses.asdict``, so the field names are
+    also the keys expected in that dict.
+    """
+    # Input / output sample rates -- presumably in Hz; TODO confirm
+    sample_rate: int = 22050
+    output_sample_rate: int = 24000
+    # Number of mel channels
+    mel_channels: int = 80
+    # NOTE(review): hop_length / win_length / n_fft / fmin / fmax / power look
+    # like STFT and mel filterbank settings; they are not consumed in this file,
+    # so verify their exact meaning against the feature-extraction code.
+    hop_length: int = 256
+    win_length: int = 1024
+    n_fft: int = 1024
+    fmin: int = 0
+    fmax: int = 8000
+    power: float = 1.0
+    # Optional path to a mel normalization file; None means no file is configured
+    mel_norms_file: Optional[str] = None
+class XTTSGPTConfig(PretrainedConfig):
+    """Configuration class for the GPT component of XTTS."""
+    model_type = "xtts_gpt"
+    def __init__(
+            self,
+            # Model architecture
+            hidden_size: int = 1024,  # gpt_n_model_channels in original
+            n_inner: int = 4096,
+            num_hidden_layers: int = 30,  # gpt_layers in original
+            num_attention_heads: int = 16,  # gpt_n_heads in original
+            # Tokenizer settings
+            vocab_size: int = 6681,  # gpt_number_text_tokens in original
+            number_text_tokens: int = 6681,  # Explicit text token vocabulary size
+            start_text_token: Optional[int] = None,
+            stop_text_token: Optional[int] = None,
+            # Audio token settings
+            num_audio_tokens: int = 1026,  # gpt_num_audio_tokens in original
+            start_audio_token: int = 1024,  # gpt_start_audio_token in original
+            stop_audio_token: int = 1025,  # gpt_stop_audio_token in original
+            # Sequence length settings
+            max_audio_tokens: int = 605,  # gpt_max_audio_tokens in original
+            max_text_tokens: int = 402,  # gpt_max_text_tokens in original
+            max_prompt_tokens: int = 70,  # gpt_max_prompt_tokens in original
+            gpt_max_audio_tokens: int = 605,  # Used for generation
+            # Model behavior settings
+            use_masking_gt_prompt_approach: bool = True,  # gpt_use_masking_gt_prompt_approach in original
+            use_perceiver_resampler: bool = True,  # gpt_use_perceiver_resampler in original
+            kv_cache: bool = True,
+            enable_redaction: bool = False,
+            # GPT batch settings
+            gpt_batch_size: int = 1,
+            # Audio processing
+            audio_config: Optional[Dict] = None,
+            # Architecture specifics
+            layer_norm_epsilon: float = 1e-5,
+            initializer_range: float = 0.02,
+            add_cross_attention: bool = False,
+            scale_attn_by_inverse_layer_idx: bool = False,
+            reorder_and_upcast_attn: bool = False,
+            # Size settings for the decoder
+            decoder_input_dim: int = 1024,
+            architectures=["XttsGPT"],
+            auto_map={
+                "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+                "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
+            },
+            activation_function: str = "gelu",
+            attn_pdrop: float = 0.1,
+            **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.architectures = architectures
+        self.auto_map = auto_map
+        self.audio_config = GPTAudioConfig(
+            **audio_config if audio_config is not None else {}
+        )
+        self.activation_function = activation_function
+        self.attn_pdrop = attn_pdrop
+        self.hidden_size = hidden_size
+        self.n_inner = n_inner
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.vocab_size = vocab_size
+        self.number_text_tokens = number_text_tokens
+        self.start_text_token = start_text_token
+        self.stop_text_token = stop_text_token
+        self.num_audio_tokens = num_audio_tokens
+        self.start_audio_token = start_audio_token
+        self.stop_audio_token = stop_audio_token
+        self.max_audio_tokens = max_audio_tokens
+        self.max_text_tokens = max_text_tokens
+        self.max_prompt_tokens = max_prompt_tokens
+        self.gpt_max_audio_tokens = gpt_max_audio_tokens
+        self.use_masking_gt_prompt_approach = use_masking_gt_prompt_approach
+        self.use_perceiver_resampler = use_perceiver_resampler
+        self.kv_cache = kv_cache
+        self.enable_redaction = enable_redaction
+        self.gpt_batch_size = gpt_batch_size
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.add_cross_attention = add_cross_attention
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.decoder_input_dim = decoder_input_dim
+    def to_dict(self) -> Dict:
+        """Convert the config to a dictionary."""
+        output = super().to_dict()
+        output["audio_config"] = asdict(self.audio_config)
+        return output
+    @classmethod
+    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSGPTConfig":
+        """Create a config from a dictionary."""
+        return cls(**config_dict)
+class XTTSConfig(PretrainedConfig):
+    """Configuration class for XTTS model components except GPT."""
+    model_type = "xtts"
+    def __init__(
+            self,
+            # Audio settings
+            audio_config: Optional[Dict] = None,
+            input_sample_rate: int = 22050,
+            output_sample_rate: int = 24000,
+            output_hop_length: int = 256,
+            # Model architecture
+            decoder_input_dim: int = 1024,
+            d_vector_dim: int = 512,
+            cond_d_vector_in_each_upsampling_layer: bool = True,
+            # Training settings
+            gpt_code_stride_len: int = 1024,
+            duration_const: int = 102400,
+            # Tokenizer settings
+            tokenizer_file: str = "",
+            num_chars: int = 255,
+            # Language support
+            languages: Optional[List[str]] = None,
+            # GPT configuration
+            gpt_config: Optional[Dict] = None,
+            architectures=["Xtts"],
+            auto_map = {
+                       "AutoConfig": "AstraMindAI/xtts2--xtts2_config.XTTSConfig",
+                       "AutoModelForCausalLM": "AstraMindAI/xtts2--xtts2_modeling.Xtts",
+                   },
+            **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.architectures = architectures
+        self.auto_map = auto_map
+        # Initialize audio config
+        self.audio_config = XTTSAudioConfig(
+            **audio_config if audio_config is not None else {}
+        )
+        self.input_sample_rate = input_sample_rate
+        self.output_sample_rate = output_sample_rate
+        self.output_hop_length = output_hop_length
+        self.decoder_input_dim = decoder_input_dim
+        self.d_vector_dim = d_vector_dim
+        self.cond_d_vector_in_each_upsampling_layer = cond_d_vector_in_each_upsampling_layer
+        self.gpt_code_stride_len = gpt_code_stride_len
+        self.duration_const = duration_const
+        self.tokenizer_file = tokenizer_file
+        self.num_chars = num_chars
+        # Initialize GPT config
+        self.gpt = XTTSGPTConfig(**gpt_config if gpt_config is not None else {})
+        if languages is None:
+            self.languages = [
+                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
+                "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
+            ]
+        else:
+            self.languages = languages
+    def to_dict(self) -> Dict:
+        """Convert the config to a dictionary."""
+        output = super().to_dict()
+        output["audio_config"] = asdict(self.audio_config)
+        output["gpt_config"] = self.gpt.to_dict()
+        return output
+    @classmethod
+    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSConfig":
+        """Create a config from a dictionary."""
+        if "gpt_config" in config_dict:
+            gpt_config = config_dict["gpt_config"]
+            config_dict = {k: v for k, v in config_dict.items() if k != "gpt_config"}
+            return cls(gpt_config=gpt_config, **config_dict)
+        return cls(**config_dict)

xtts2_modeling.py CHANGED Viewed

@@ -1,81 +1,433 @@
 import asyncio
 from dataclasses import dataclass
-from typing import Optional, List, Tuple
 from concurrent.futures import ThreadPoolExecutor
 import torch
 import numpy as np
-from transformers import PreTrainedModel
-from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, TokensPrompt
 from vllm.multimodal import MultiModalDataDict
 from vllm.utils import Counter
 from TTS.TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
-from gpt_config import XTTSGPTConfig
-from xtts2_config import XTTSConfig
-from tokenizer import XTTSTokenizerFast
 @dataclass
 class XTTSRequest:
     """Container for XTTS inference request data"""
     request_id: str
-    text: str
     language: str
-    gpt_cond_latent: torch.Tensor
-    speaker_embedding: torch.Tensor
     temperature: float = 0.75
     top_p: float = 0.85
     top_k: int = 50
-    repetition_penalty: float = 10.0
     length_penalty: float = 1.0
     do_sample: bool = True
 @dataclass
 class XTTSOutput:
-    """Container for XTTS inference output"""
     request_id: str
     wav: np.ndarray
-    gpt_latents: np.ndarray
-    speaker_embedding: torch.Tensor
-class Xtts(PreTrainedModel):
     """Async XTTS model implementation using VLLM's AsyncEngine."""
     def __init__(self, hifi_config: XTTSConfig, gpt_config: XTTSGPTConfig, tensor_parallel_size: int = 1, **kwargs):
         self.hifi_config = hifi_config
         self.gpt_config = gpt_config
         self.tp = tensor_parallel_size
         self.tokenizer = XTTSTokenizerFast.from_pretrained("AstraMindAI/xtts2-gpt")
         self.request_counter = Counter()
         self.executor = ThreadPoolExecutor(max_workers=4)  # For CPU-bound tasks
-        self.init_models()
         self.register_buffer("mel_stats", torch.ones(80))
-    @staticmethod
-    def get_memory_percentage(memory: int) -> float:
-        """Get memory percentage."""
-        return memory / torch.cuda.get_device_properties(0).total_memory
-    async def init_models(self):
-        """Initialize models with AsyncVLLMEngine."""
-        # Initialize VLLM engine
-        engine_args = AsyncEngineArgs(
-            model=self.gpt_config.model_dir,
-            tensor_parallel_size=self.tp,
-            dtype="auto ",
-            max_model_len=self.gpt_config.gpt_max_text_tokens + self.gpt_config.gpt_max_audio_tokens,
-            gpu_memory_utilization=self.get_memory_percentage(2),# since the model neds 2 gb we need to calc the bare minimum memory
-            trust_remote_code=True,
-            skip_tokenizer_init=True, # no need to initialize tokenizer, we use our own
-            max_num_batched_tokens=4096,
-            max_num_seqs=256,
         )
-        self.llm_engine = AsyncLLMEngine.from_engine_args(engine_args)
-        self.llm_engine = AsyncLLMEngine
         # Initialize HiFi-GAN decoder
         self.hifigan_decoder = HifiDecoder(
             input_sample_rate=self.hifi_config.input_sample_rate,
@@ -87,26 +439,78 @@ class Xtts(PreTrainedModel):
             cond_d_vector_in_each_upsampling_layer=self.hifi_config.cond_d_vector_in_each_upsampling_layer,
         )
     @classmethod
     def from_pretrained(
             cls,
             pretrained_model_name_or_path: str,
-            torch_dtype: torch.dtype = torch.float16,
             device_map: Optional[str] = "auto",
             tensor_parallel_size: int = 1,
             **kwargs,
     ) -> "Xtts":
-        """Load pretrained XTTS model from HuggingFace Hub.
-        Args:
-            pretrained_model_name_or_path (str): Path to pretrained weights or HF Hub model id
-            torch_dtype (torch.dtype, optional): Type to load the model as. Defaults to float16.
-            device_map (str, optional): Device mapping strategy. Defaults to "auto".
-            **kwargs: Additional arguments passed to the model.
-        Returns:
-            Xtts: Loaded model instance
-        """
         from huggingface_hub import hf_hub_download
         import json
         import os
@@ -115,32 +519,18 @@ class Xtts(PreTrainedModel):
         if not os.path.exists(pretrained_model_name_or_path):
             config_file = hf_hub_download(
                 repo_id=pretrained_model_name_or_path,
-                filename="../xtts2_gpt/config.json"
             )
             with open(config_file, 'r') as f:
                 config = json.load(f)
-            gpt_config_file = hf_hub_download(
-                repo_id=pretrained_model_name_or_path,
-                filename="gpt_config.py"
-            )
-            with open(gpt_config_file, 'r') as f:
-                gpt_config = json.loads(f.read())
-            hifigan_config_file = hf_hub_download(
-                repo_id=pretrained_model_name_or_path,
-                filename="xtts2_config.py"
-            )
-            with open(hifigan_config_file, 'r') as f:
-                hifigan_config = json.loads(f.read())
         else:
             # Load from local path
             with open(os.path.join(pretrained_model_name_or_path, "config.json"), 'r') as f:
                 config = json.load(f)
         # Initialize configs
-        gpt_config = XTTSGPTConfig(**config)
         hifi_config = XTTSConfig(**config)
         # Initialize model
@@ -153,107 +543,528 @@ class Xtts(PreTrainedModel):
         # Load model weights
         if not os.path.exists(pretrained_model_name_or_path):
-            gpt_weights = hf_hub_download(
-                repo_id=pretrained_model_name_or_path,
-                filename="../xtts2_gpt/xttsv2-gpt.safetensors"
-            )
             hifigan_weights = hf_hub_download(
                 repo_id=pretrained_model_name_or_path,
-                filename="xttsv2-hifigan-mel.safetensors"
             )
         else:
-            gpt_weights = os.path.join(pretrained_model_name_or_path, "xttsv2-gpt.safetensors")
-            hifigan_weights = os.path.join(pretrained_model_name_or_path, "xttsv2-hifigan-mel.safetensors")
-        # Load GPT weights
         import safetensors.torch
-        state_dict = safetensors.torch.load_file(gpt_weights)
-        model.gpt.load_state_dict(state_dict)
         # Load HiFi-GAN weights
         hifigan_state = safetensors.torch.load_file(hifigan_weights)
-        model.hifigan_decoder.load_state_dict(hifigan_state)
         # Set model properties
         model.config = config
         # Cast model to specified dtype
         model = model.to(torch_dtype)
-        # Handle device mapping
-        if device_map:
-            from accelerate import dispatch_model
-            model = dispatch_model(model, device_map=device_map)
         return model
-    def prepare_inputs(self, text: str, language: str, gpt_cond_latent: torch.Tensor) -> Tuple[List[int], torch.Tensor]:
-        """Prepare input text with conditioning tokens."""
-        # Add special tokens and conditioning format
-        # Format: <|condition|>latent_data<|endofcondition|>text<|endoftext|>
-        text_tokens = self.tokenizer.encode(text, lang=language)
-        return text_tokens, gpt_cond_latent
-    async def generate_speech_async(self, request: XTTSRequest) -> XTTSOutput:
-        """Generate speech for a single request asynchronously."""
-        # Prepare input with conditioning
-        tokens, gpt_cond_latent = self.prepare_inputs(
-            request.text,
-            request.language,
-            request.gpt_cond_latent
         )
-        # Setup sampling parameters
         sampling_params = SamplingParams(
             temperature=request.temperature,
             top_p=request.top_p,
             top_k=request.top_k,
-            repetition_penalty=request.repetition_penalty,
             max_tokens=self.gpt_config.gpt_max_audio_tokens,
-            stop=['</s>', '<|endoftext|>']
-        )
-        engine_inputs = TokensPrompt( prompt_token_ids = tokens )
-        if gpt_cond_latent is not None:
-            engine_inputs["multi_modal_data"] = MultiModalDataDict({"audio":gpt_cond_latent})
-        # Generate tokens using VLLM
-        output_generator = self.llm_engine.generate(
-            inputs=engine_inputs,
-            sampling_params=sampling_params,
-            request_id=request.request_id
         )
-        async for outputs in output_generator:
-            # Extract generated tokens
-            generated_tokens = outputs.outputs[0].token_ids
-            # Convert to hidden states (this step depends on your model architecture)
-            hidden_states = await self._tokens_to_hidden_states(generated_tokens)
-            # Generate audio using HiFi-GAN (run in thread pool to avoid blocking)
-            wav = await asyncio.get_event_loop().run_in_executor(
-                self.executor,
-                lambda: self.hifigan_decoder(
-                    hidden_states,
-                    g=request.speaker_embedding
-                ).cpu().numpy().squeeze()
             )
-            return XTTSOutput(
-                request_id=request.request_id,
-                wav=wav,
-                gpt_latents=hidden_states.cpu().numpy(),
-                speaker_embedding=request.speaker_embedding
             )
-    async def _tokens_to_hidden_states(self, tokens: List[int]) -> torch.Tensor:
-        """Convert generated tokens to hidden states."""
-        # This implementation depends on your specific model architecture
-        # You'll need to adapt this based on how your model processes tokens
-        # This is a placeholder implementation
-        token_tensor = torch.tensor(tokens, device=self.device)
-        # Use VLLM's engine to get hidden states
-        hidden_states = await self.llm_engine.encode(token_tensor)
-        return hidden_states

 import asyncio
+import functools
+import logging
+import random
+import time
+import uuid
 from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, List, Tuple, Union, AsyncGenerator, Dict, Any
 from concurrent.futures import ThreadPoolExecutor
+import librosa
 import torch
 import numpy as np
+import torchaudio
+import sounddevice as sd
+import io
+from torch import nn
+from IPython.display import Audio, display
+from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, TokensPrompt, RequestOutput
 from vllm.multimodal import MultiModalDataDict
 from vllm.utils import Counter
 from TTS.TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
+from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder  # noqa
+from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler  # noqa
+from .xtts2_config import XTTSConfig, XTTSGPTConfig
+from .tokenizer import XTTSTokenizerFast
+from ..xtts2_gpt.xtts2_gpt_modeling import LearnedPositionEmbeddings
+def wav_to_mel_cloning(
+        wav,
+        mel_norms_file="../experiments/clips_mel_norms.pth",
+        mel_norms=None,
+        device=torch.device("cpu"),
+        n_fft=4096,
+        hop_length=1024,
+        win_length=4096,
+        power=2,
+        normalized=False,
+        sample_rate=22050,
+        f_min=0,
+        f_max=8000,
+        n_mels=80,
+):
+    mel_stft = torchaudio.transforms.MelSpectrogram(
+        n_fft=n_fft,
+        hop_length=hop_length,
+        win_length=win_length,
+        power=power,
+        normalized=normalized,
+        sample_rate=sample_rate,
+        f_min=f_min,
+        f_max=f_max,
+        n_mels=n_mels,
+        norm="slaney",
+    ).to(device)
+    wav = wav.to(device)
+    mel = mel_stft(wav)
+    mel = torch.log(torch.clamp(mel, min=1e-5))
+    if mel_norms is None:
+        mel_norms = torch.load(mel_norms_file, map_location=device)
+    mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1)
+    return mel
+def load_audio(audiopath, sampling_rate):
+    audio, lsr = torchaudio.load(audiopath)
+    # Stereo to mono if needed
+    if audio.size(0) != 1:
+        audio = torch.mean(audio, dim=0, keepdim=True)
+    if lsr != sampling_rate:
+        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
+    # Clip audio invalid values
+    audio.clip_(-1, 1)
+    return audio
 @dataclass
 class XTTSRequest:
     """Container for XTTS inference request data"""
+    # Identifier used to tell requests apart
     request_id: str
+    # Text to synthesize: either a complete string or an async generator of string pieces
+    text: Union[AsyncGenerator[str, None], str]
+    # Language of the text (a language code such as "en" -- presumably one of
+    # the languages listed in XTTSConfig; verify against the tokenizer)
     language: str
+    speaker_file: str  # Path to the speaker audio file
+    # NOTE(review): presumably the chunk size, in characters, used when `text`
+    # is streamed; None leaves it unset -- confirm against the generation code
+    generate_every_n_chars: Optional[int] = None
+    # Sampling settings for token generation
     temperature: float = 0.75
     top_p: float = 0.85
     top_k: int = 50
+    repetition_penalty: float = 5.0
     length_penalty: float = 1.0
     do_sample: bool = True
+    # NOTE(review): the three fields below look like speaker-reference
+    # conditioning lengths (presumably seconds); they are not consumed in the
+    # visible code -- confirm units and meaning against the conditioning code
+    max_ref_length: int = 60
+    gpt_cond_len: int = 30
+    gpt_cond_chunk_len: int = 4
+import threading
+class HiddenStatesCollector:
+    def __init__(self):
+        self.outputs = {}
+        self.lock = threading.Lock()
+    def __call__(self, outputs: Optional[torch.Tensor], request_id: str):
+        """Save outputs for a specific request"""
+        with self.lock:
+            if request_id not in self.outputs:
+                self.outputs[request_id] = []
+            self.outputs[request_id].append(outputs)
+    def get_hidden_states(self, request_id) -> Optional[torch.Tensor]:
+        with self.lock:
+            outputs = self.outputs.pop(request_id, None)
+        if outputs is not None:
+            outputs = torch.cat(outputs, dim=0)
+        return outputs
+    def bind_to_request(self, request_id: str):
+        def bound_collector(outputs: Optional[torch.Tensor], _request_id: str = None):
+            self(outputs, request_id)
+        return bound_collector
+class ExtendedSamplingParams(SamplingParams, kw_only=True):
+    """Extended sampling parameters that allows additional fields while maintaining compatibility with SamplingParams.
+    This class inherits from SamplingParams and allows adding new required fields
+    without conflicting with the base class's optional fields ordering.
+    The ``kw_only=True`` class keyword is what permits a field without a
+    default after the base class's defaulted fields (presumably handled by
+    the struct machinery behind SamplingParams -- verify against vLLM).
+    """
+    # Collector that receives the hidden states produced for the request
+    hidden_state_collector: HiddenStatesCollector  # New required field
+class LogitsRepetitionPenalizer:
+    """A logits processor that applies repetition penalty to prevent repetitive text generation."""
+    def __init__(self, repetition_penalty: float):
+        if repetition_penalty < 0:
+            raise ValueError("Repetition penalty must be non-negative")
+        self.repetition_penalty = repetition_penalty
+    def __call__(self, token_ids: List[int], logits: torch.Tensor) -> torch.Tensor:
+        """Apply repetition penalty to the logits based on previous tokens."""
+        # If no repetition penalty or no tokens to check, return original logits
+        if self.repetition_penalty == 1.0 or not token_ids:
+            return logits
+        # Create a mask for the repeated tokens
+        repeated_tokens = torch.tensor(token_ids,
+                                       device=logits.device,
+                                       dtype=torch.long)
+        # Get logits of repeated tokens
+        repeated_logits = logits[repeated_tokens]
+        # Apply penalty: divide positive logits by penalty, multiply negative logits by penalty
+        repeated_logits = torch.where(
+            repeated_logits > 0,
+            repeated_logits / self.repetition_penalty,
+            repeated_logits * self.repetition_penalty
+        )
+        # Update only the logits for repeated tokens
+        logits[repeated_tokens] = repeated_logits
+        return logits
 @dataclass
 class XTTSOutput:
+    """Container for XTTS inference output with integrated audio utilities"""
     request_id: str
     wav: np.ndarray
+    sample_rate: int = 24000
+    def to_tensor(self) -> torch.Tensor:
+        """Convert numpy array to torch tensor"""
+        if isinstance(self.wav, np.ndarray):
+            return torch.from_numpy(self.wav)
+        return self.wav
+    def to_bytes(self, format: str = 'wav', sample_width: int = 2) -> bytes:
+        """Convert audio to bytes format.
+        Args:
+            format: Output format ('wav' or 'raw')
+            sample_width: Bit depth (1, 2, or 4 bytes per sample)
+        Returns:
+            Audio data as bytes
+        """
+        # Convert to tensor if needed
+        wav_tensor = self.to_tensor()
+        # Ensure correct shape (1, N) for torchaudio
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+        # Normalize to [-1, 1]
+        wav_tensor = torch.clamp(wav_tensor, -1.0, 1.0)
+        if format == 'wav':
+            buffer = io.BytesIO()
+            torchaudio.save(
+                buffer,
+                wav_tensor,
+                self.sample_rate,
+                format="wav",
+                encoding="PCM_S" if sample_width == 2 else "PCM_F",
+                bits_per_sample=sample_width * 8
+            )
+            return buffer.getvalue()
+        elif format == 'raw':
+            # Scale to appropriate range based on sample width
+            if sample_width == 2:  # 16-bit
+                wav_tensor = (wav_tensor * 32767).to(torch.int16)
+            elif sample_width == 4:  # 32-bit
+                wav_tensor = (wav_tensor * 2147483647).to(torch.int32)
+            else:  # 8-bit
+                wav_tensor = (wav_tensor * 127).to(torch.int8)
+            return wav_tensor.cpu().numpy().tobytes()
+        else:
+            raise ValueError(f"Unsupported format: {format}")
+    def save(self,
+             filename: Union[str, Path],
+             sample_rate: Optional[int] = None,
+             format: Optional[str] = None) -> None:
+        """Save audio to file.
+        Args:
+            filename: Output filename
+            sample_rate: Optional new sample rate for resampling
+            format: Optional format override (default: inferred from extension)
+        """
+        wav_tensor = self.to_tensor()
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+        # Resample if needed
+        if sample_rate and sample_rate != self.sample_rate:
+            wav_tensor = torchaudio.functional.resample(
+                wav_tensor,
+                orig_freq=self.sample_rate,
+                new_freq=sample_rate
+            )
+        else:
+            sample_rate = self.sample_rate
+        torchaudio.save(
+            filename,
+            wav_tensor,
+            sample_rate,
+            format=format
+        )
+    def resample(self, new_sample_rate: int) -> 'XTTSOutput':
+        """Create new XTTSOutput with resampled audio.
+        Args:
+            new_sample_rate: Target sample rate
+        Returns:
+            New XTTSOutput instance with resampled audio
+        """
+        wav_tensor = self.to_tensor()
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+        resampled = torchaudio.functional.resample(
+            wav_tensor,
+            orig_freq=self.sample_rate,
+            new_freq=new_sample_rate
+        )
+        return XTTSOutput(
+            request_id=self.request_id,
+            wav=resampled.squeeze().numpy(),
+            sample_rate=new_sample_rate
+        )
+    def get_info(self) -> Tuple[int, int, float]:
+        """Get audio information.
+        Returns:
+            Tuple of (number of samples, sample rate, duration in seconds)
+        """
+        n_samples = len(self.wav)
+        duration = n_samples / self.sample_rate
+        return n_samples, self.sample_rate, duration
+    @classmethod
+    def from_tensor(cls, request_id: str, tensor: torch.Tensor, sample_rate: int = 24000) -> 'XTTSOutput':
+        """Create XTTSOutput from torch tensor.
+        Args:
+            request_id: Request identifier
+            tensor: Audio tensor
+            sample_rate: Sample rate of the audio
+        Returns:
+            New XTTSOutput instance
+        """
+        return cls(
+            request_id=request_id,
+            wav=tensor.squeeze().cpu().numpy(),
+            sample_rate=sample_rate
+        )
+    @classmethod
+    def from_file(cls, request_id: str, filename: Union[str, Path]) -> 'XTTSOutput':
+        """Create XTTSOutput from audio file.
+        Args:
+            request_id: Request identifier
+            filename: Path to audio file
+        Returns:
+            New XTTSOutput instance
+        """
+        wav_tensor, sample_rate = torchaudio.load(filename)
+        return cls.from_tensor(request_id, wav_tensor, sample_rate)
+    def play(self) -> None:
+        """Play the audio through the default sound device.
+        For use in regular Python scripts/applications."""
+        # Ensure the audio is in the correct format
+        if isinstance(self.wav, torch.Tensor):
+            audio_data = self.wav.cpu().numpy()
+        else:
+            audio_data = self.wav
+        # Ensure float32 and normalize
+        if audio_data.dtype != np.float32:
+            audio_data = audio_data.astype(np.float32)
+        audio_data = np.clip(audio_data, -1.0, 1.0)
+        # Play the audio
+        sd.play(audio_data, self.sample_rate)
+        sd.wait()  # Wait until the audio is finished playing
+    def display(self) -> Optional[Audio]:
+        """Display audio player in Jupyter notebook.
+        Returns Audio widget if in notebook, None otherwise."""
+        try:
+            # Convert to bytes
+            audio_bytes = self.to_bytes(format='wav')
+            # Create and display audio widget
+            audio_widget = Audio(audio_bytes, rate=self.sample_rate, autoplay=False)
+            display(audio_widget)
+            return audio_widget
+        except Exception as e:
+            print(f"Could not display audio widget: {str(e)}")
+            print("Try using .play() method instead")
+            return None
+    def preview(self) -> None:
+        """Smart play method that chooses appropriate playback method."""
+        try:
+            # Try notebook display first
+            if self.display() is None:
+                # Fall back to sounddevice if not in notebook
+                self.play()
+        except Exception as e:
+            print(f"Error playing audio: {str(e)}")
+class Xtts(nn.Module):
     """Async XTTS model implementation using VLLM's AsyncEngine."""
     def __init__(self, hifi_config: XTTSConfig, gpt_config: XTTSGPTConfig, tensor_parallel_size: int = 1, **kwargs):
+        """Build the conditioning/embedding modules, the HiFi-GAN decoder and the vLLM engine.
+        Args:
+            hifi_config: Configuration read for the HiFi-GAN decoder settings.
+            gpt_config: Configuration read for token ids, sizes and the perceiver switch.
+            tensor_parallel_size: Forwarded to the vLLM engine as its tensor parallel size.
+            **kwargs: Accepted but not used by this constructor.
+        """
+        super().__init__()
         self.hifi_config = hifi_config
         self.gpt_config = gpt_config
+        # Audio start/stop token ids, later wrapped around token sequences in get_model_logits
+        self.mel_bos_token_id = gpt_config.start_audio_token
+        self.mel_eos_token_id = gpt_config.stop_audio_token
         self.tp = tensor_parallel_size
         self.tokenizer = XTTSTokenizerFast.from_pretrained("AstraMindAI/xtts2-gpt")
         self.request_counter = Counter()
         self.executor = ThreadPoolExecutor(max_workers=4)  # For CPU-bound tasks
+        # Collects per-request hidden states produced by the engine (see get_model_logits)
+        self.hidden_states_collector = HiddenStatesCollector()
+        # Register buffer before creating modules
         self.register_buffer("mel_stats", torch.ones(80))
+        # Initialize all nn.Module components
+        self.conditioning_encoder = ConditioningEncoder(
+            gpt_config.audio_config.mel_channels,
+            gpt_config.hidden_size,
+            num_attn_heads=gpt_config.num_attention_heads
+        )
+        self.text_embedding = nn.Embedding(
+            gpt_config.number_text_tokens,
+            gpt_config.hidden_size
         )
+        # Learned positions sized max_text_tokens + 2, unless max_audio_tokens is -1,
+        # in which case the config's null_position_embeddings callable is used instead
+        self.text_pos_embedding = (
+            LearnedPositionEmbeddings(
+                gpt_config.max_text_tokens + 2,
+                gpt_config.hidden_size,
+                supports_pp=False
+            )
+            if gpt_config.max_audio_tokens != -1
+            else functools.partial(gpt_config.null_position_embeddings, dim=gpt_config.hidden_size)
+        )
+        # The perceiver only exists when enabled; get_style_emb checks for it with hasattr
+        if gpt_config.use_perceiver_resampler:
+            self.conditioning_perceiver = PerceiverResampler(
+                dim=gpt_config.hidden_size,
+                depth=2,
+                dim_context=gpt_config.hidden_size,
+                num_latents=32,
+                dim_head=64,
+                heads=8,
+                ff_mult=4,
+                use_flash_attn=False,
+            )
         # Initialize HiFi-GAN decoder
         self.hifigan_decoder = HifiDecoder(
             input_sample_rate=self.hifi_config.input_sample_rate,
             cond_d_vector_in_each_upsampling_layer=self.hifi_config.cond_d_vector_in_each_upsampling_layer,
         )
+        # Kept for model loading purposes
+        self.text_head = nn.Linear(gpt_config.hidden_size, gpt_config.number_text_tokens, bias=True)
+        self.final_norm = nn.LayerNorm(gpt_config.hidden_size, eps=1e-5, bias=True)
+        # Initialize VLLM engine at the end
+        self.init_vllm_engine()
+        # Semaphore for concurrency control (acquired in get_conditioning_latents_async)
+        self.max_concurrency = 10
+        self.semaphore = asyncio.BoundedSemaphore(self.max_concurrency)
+    def half(self):
+        # We cannot permit downcasting since it will throw an error while padding
+        return
+    def to(self, *args, **kwargs):
+        # Block downcasting
+        dtype = kwargs.get('dtype', None)
+        if dtype == torch.float16 or dtype == torch.bfloat16:
+            kwargs['dtype'] = torch.float32
+        elif len(args) > 0 and (args[0] == torch.float16 or args[0] == torch.bfloat16):
+            args = list(args)
+            args[0] = torch.float32
+            args = tuple(args)
+        return super().to(*args, **kwargs)
+    @property
+    def device(self):
+        """Get the current device of the model."""
+        return next(self.parameters()).device
+    @property
+    def dtype(self):
+        """Get the current dtype of the model."""
+        return next(self.parameters()).dtype
+    @staticmethod
+    def get_memory_percentage(memory: int) -> float:
+        """Get memory percentage."""
+        total_memory = torch.cuda.get_device_properties(0).total_memory
+        reserved_memory = torch.cuda.memory_reserved(0)
+        allocated_memory = torch.cuda.memory_allocated(0)
+        available_memory = total_memory - reserved_memory - allocated_memory
+        return memory / available_memory
+    def init_vllm_engine(self):
+        """Initialize models with AsyncVLLMEngine."""
+        engine_args = AsyncEngineArgs(
+            model="AstraMindAI/xtts2-gpt",
+            tensor_parallel_size=self.tp,
+            dtype="auto",
+            disable_log_stats=True,
+            max_model_len=self.gpt_config.max_text_tokens + self.gpt_config.max_audio_tokens,
+            gpu_memory_utilization=self.get_memory_percentage(3 * 1024 ** 3),
+            trust_remote_code=True,
+            enforce_eager=True,
+            limit_mm_per_prompt={"audio": 1},
+            max_num_batched_tokens=7296,
+        )
+        self.llm_engine = AsyncLLMEngine.from_engine_args(engine_args)
     @classmethod
     def from_pretrained(
             cls,
             pretrained_model_name_or_path: str,
+            torch_dtype: torch.dtype = torch.float32,
             device_map: Optional[str] = "auto",
             tensor_parallel_size: int = 1,
             **kwargs,
     ) -> "Xtts":
+        """Load pretrained XTTS model from HuggingFace Hub."""
         from huggingface_hub import hf_hub_download
         import json
         import os
         if not os.path.exists(pretrained_model_name_or_path):
             config_file = hf_hub_download(
                 repo_id=pretrained_model_name_or_path,
+                filename="config.json"
             )
             with open(config_file, 'r') as f:
                 config = json.load(f)
         else:
             # Load from local path
             with open(os.path.join(pretrained_model_name_or_path, "config.json"), 'r') as f:
                 config = json.load(f)
         # Initialize configs
+        gpt_config = XTTSGPTConfig(**config['gpt_config'])
         hifi_config = XTTSConfig(**config)
         # Initialize model
         # Load model weights
         if not os.path.exists(pretrained_model_name_or_path):
             hifigan_weights = hf_hub_download(
                 repo_id=pretrained_model_name_or_path,
+                filename="xtts-v2.safetensors"
             )
         else:
+            hifigan_weights = os.path.join(pretrained_model_name_or_path, "xtts-v2.safetensors")
         import safetensors.torch
         # Load HiFi-GAN weights
         hifigan_state = safetensors.torch.load_file(hifigan_weights)
+        model.load_state_dict(hifigan_state)
         # Set model properties
         model.config = config
         # Cast model to specified dtype
         model = model.to(torch_dtype)
+        model = model.to('cuda')
         return model
+    @staticmethod
+    def load_audio(audio_path: Union[str, Path], sampling_rate: int = 22050) -> torch.Tensor:
+        audio, lsr = torchaudio.load(audio_path)
+        # Stereo to mono if needed
+        if audio.size(0) != 1:
+            audio = torch.mean(audio, dim=0, keepdim=True)
+        if lsr != sampling_rate:
+            audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
+        # Clip audio invalid values
+        audio.clip_(-1, 1)
+        return audio
+    @torch.inference_mode()
+    def get_speaker_embedding(self, audio, sr):
+        audio_16k = torchaudio.functional.resample(audio, sr, 16000)
+        return (
+            self.hifigan_decoder.speaker_encoder.forward(audio_16k.to(self.device), l2_norm=True)
+            .unsqueeze(-1)
+            .to(self.device)
+        )
+    @torch.inference_mode()
+    def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
+        """Compute the conditioning latents for the GPT model from the given audio."""
+        if sr != 22050:
+            audio = torchaudio.functional.resample(audio, sr, 22050)
+        if length > 0:
+            audio = audio[:, : 22050 * length]
+        if self.gpt_config.use_perceiver_resampler:
+            style_embs = []
+            for i in range(0, audio.shape[1], 22050 * chunk_length):
+                audio_chunk = audio[:, i: i + 22050 * chunk_length]
+                # if the chunk is too short ignore it
+                if audio_chunk.size(-1) < 22050 * 0.33:
+                    continue
+                mel_chunk = wav_to_mel_cloning(
+                    audio_chunk,
+                    mel_norms=self.mel_stats.cpu(),
+                    n_fft=2048,
+                    hop_length=256,
+                    win_length=1024,
+                    power=2,
+                    normalized=False,
+                    sample_rate=22050,
+                    f_min=0,
+                    f_max=8000,
+                    n_mels=80,
+                )
+                style_emb = self.get_style_emb(mel_chunk.to(self.device), None)
+                style_embs.append(style_emb)
+            # mean style embedding
+            cond_latent = torch.stack(style_embs).mean(dim=0)
+        else:
+            mel = wav_to_mel_cloning(
+                audio,
+                mel_norms=self.mel_stats.cpu(),
+                n_fft=4096,
+                hop_length=1024,
+                win_length=4096,
+                power=2,
+                normalized=False,
+                sample_rate=22050,
+                f_min=0,
+                f_max=8000,
+                n_mels=80,
+            )
+            cond_latent = self.get_style_emb(mel.to(self.device))
+        return cond_latent.transpose(1, 2)
+    @torch.inference_mode()
+    def get_conditioning_latents(
+            self,
+            audio_path,
+            max_ref_length=30,
+            gpt_cond_len=6,
+            gpt_cond_chunk_len=6,
+            librosa_trim_db=None,
+            sound_norm_refs=False,
+            load_sr=22050,
+    ):
+        """Get the conditioning latents for the GPT model from the given audio."""
+        # Deal with multiple references
+        assert isinstance(audio_path, str) or isinstance(audio_path, list), "audio_path must be a string or a list."
+        if not isinstance(audio_path, list):
+            audio_paths = [audio_path]
+        else:
+            audio_paths = audio_path
+        speaker_embeddings = []
+        audios = []
+        for file_path in audio_paths:
+            audio = load_audio(file_path, load_sr)
+            audio = audio[:, : load_sr * max_ref_length].to(self.device).to(self.dtype)
+            if sound_norm_refs:
+                audio = (audio / torch.abs(audio).max()) * 0.75
+            if librosa_trim_db is not None:
+                audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0]
+            # Compute latents for the decoder
+            speaker_embedding = self.get_speaker_embedding(audio, load_sr)
+            speaker_embeddings.append(speaker_embedding)
+            audios.append(audio)
+        # Merge all the audios and compute the latents for the GPT
+        full_audio = torch.cat(audios, dim=-1)
+        gpt_cond_latents = self.get_gpt_cond_latents(
+            full_audio, load_sr, length=gpt_cond_len, chunk_length=gpt_cond_chunk_len
+        )  # [1, 1024, T]
+        speaker_embedding = torch.stack(speaker_embeddings)
+        speaker_embedding = speaker_embedding.mean(dim=0)
+        return gpt_cond_latents, speaker_embedding
+    def get_style_emb(self, cond_input: torch.Tensor, return_latent: bool = False) -> torch.Tensor:
+        """Get conditioning embeddings from mel spectrograms."""
+        if not return_latent:
+            if cond_input.ndim == 4:
+                cond_input = cond_input.squeeze(1)
+            conds = self.conditioning_encoder(cond_input)
+            if hasattr(self, 'conditioning_perceiver'):
+                conds = self.conditioning_perceiver(
+                    conds.permute(0, 2, 1)
+                ).transpose(1, 2)
+        else:
+            conds = cond_input.unsqueeze(1)
+        return conds
+    async def prepare_text_tokens_async(self, text: str, language: str, split_text=False) \
+            -> Tuple[List[Union[int, List[int]]], List[torch.Tensor]]:
+        """Prepare text tokens for the given text and language.
+        Args:
+            text: Text to tokenize.
+            language: Language passed to the tokenizer as ``lang=[language]``.
+            split_text: When True, use ``batch_encode_with_split`` and handle each piece separately.
+        Returns:
+            Tuple ``(placeholder_tokens, embeddings)``: lists of ``1`` used as the prompt
+            token ids (one list per piece when splitting), and a list of text
+            embeddings (token embedding + position embedding).
+        """
+        async def elaborate_tokens(text_tokens: List[int]) -> torch.Tensor:
+            # Wraps the ids with BOS/EOS *in place*, then returns them as a tensor with a
+            # leading dimension of 1 on the text embedding's device
+            text_tokens.insert(0, self.tokenizer.bos_token_id)
+            text_tokens.append(self.tokenizer.eos_token_id)
+            return torch.tensor(text_tokens).unsqueeze(0).to(self.text_embedding.weight.device)
+        async def embed_tokens(text_tokens: Union[torch.Tensor, List[torch.Tensor]]) -> List[torch.Tensor]:
+            # Always returns a list, whether one tensor or a list of tensors came in
+            embeds = []
+            if isinstance(text_tokens, list):
+                for list_element in text_tokens:
+                    embeds.append(self.text_embedding(list_element) + self.text_pos_embedding(list_element))
+            else:
+                embeds.append(self.text_embedding(text_tokens) + self.text_pos_embedding(text_tokens))
+            return embeds
+        fake_tokens_for_audio_generation = []
+        if split_text:
+            text_tokens = self.tokenizer.batch_encode_with_split(text, lang=[language])
+            for idx, text_token in enumerate(text_tokens):
+                text_tokens[idx] = await elaborate_tokens(text_token)
+                # text_token was mutated above, so this length already includes BOS and EOS
+                fake_tokens_for_audio_generation.append([1] * len(text_token))
+        else:
+            text_tokens = self.tokenizer.batch_encode(text, lang=[language])
+            text_tokens = await elaborate_tokens(text_tokens)
+            # NOTE(review): text_tokens is now a tensor with a leading dimension of 1, so
+            # len() is 1 here, unlike the split branch — confirm this is intended
+            fake_tokens_for_audio_generation = [1] * len(text_tokens)
+        return fake_tokens_for_audio_generation, await embed_tokens(text_tokens)
+    async def prepare_inputs_async(self, text: str, language: str, speaker_file: Union[str, Path],
+                                   max_ref_length: int, gpt_cond_len: int, gpt_cond_chunk_len: int, split_text: bool) \
+            -> Tuple[List[List[int]], List[torch.Tensor], torch.Tensor]:
+        """Prepare input text with conditioning tokens. Return combined conditioning latents"""
+        # Tokenize text based on the language
+        text_tokens, text_embeddings = await self.prepare_text_tokens_async(text, language, split_text)
+        # Load the speaker file and convert it to a tensor
+        gpt_cond_latent, speaker_embeddings = await self.get_conditioning_latents_async(
+            speaker_file,
+            max_ref_length,
+            gpt_cond_len,
+            gpt_cond_chunk_len
+        )
+        cond_latents = []
+        for text_embedding in text_embeddings:
+            # Concatenate along sequence dimension
+            cond_latents.append((torch.cat([gpt_cond_latent, text_embedding], dim=1).squeeze(0)
+                                 .to(self.llm_engine.engine.model_config.dtype)))
+        return text_tokens, cond_latents, speaker_embeddings
+    async def get_conditioning_latents_async(
+            self,
+            audio_path,
+            max_ref_length=30,
+            gpt_cond_len=6,
+            gpt_cond_chunk_len=6,
+            librosa_trim_db=None,
+            sound_norm_refs=False,
+            load_sr=22050,
+    ):
+        """Async version of get_conditioning_latents with concurrency control."""
+        async with self.semaphore:
+            # Run the original get_conditioning_latents in executor
+            result = await asyncio.get_event_loop().run_in_executor(
+                None,
+                functools.partial(self.get_conditioning_latents,
+                                  audio_path,
+                                  max_ref_length,
+                                  gpt_cond_len,
+                                  gpt_cond_chunk_len,
+                                  librosa_trim_db,
+                                  sound_norm_refs,
+                                  load_sr)
+            )
+        return result
+    async def get_model_logits(self, token_ids: List[int], conditioning: MultiModalDataDict) -> torch.Tensor:
+        """Get model logits for a specific request"""
+        request_id = uuid.uuid4().hex
+        # Add start and end tokens
+        token_ids = [self.mel_bos_token_id] + token_ids + [self.mel_eos_token_id] * 5
+        engine_inputs = TokensPrompt(prompt_token_ids=token_ids)
+        engine_inputs["multi_modal_data"] = conditioning
+        # Bind the collector to this request
+        bound_collector = self.hidden_states_collector.bind_to_request(request_id)
+        # Set up sampling parameters with the bound collector
+        sampling_params = ExtendedSamplingParams(
+            detokenize=False,
+            max_tokens=1,
+            hidden_state_collector=bound_collector,
         )
+        # Generate with unique request ID
+        generator = self.llm_engine.generate(
+            prompt=engine_inputs,
+            sampling_params=sampling_params,
+            request_id=request_id
+        )
+        # Consume the generator with a timeout
+        try:
+            async def consume_generator():
+                async for _ in generator:
+                    pass
+            await asyncio.wait_for(consume_generator(), timeout=300)
+        except asyncio.TimeoutError:
+            raise RuntimeError("Timeout while generating logits")
+        # Get the collected hidden states
+        hidden_states = self.hidden_states_collector.get_hidden_states(request_id)
+        if hidden_states is None:
+            raise RuntimeError(f"No hidden states collected for request {request_id}")
+        return hidden_states[-len(token_ids):, ...].unsqueeze(0).to(self.device).to(self.dtype)
+    async def process_tokens_to_speech(
+            self,
+            generators: List[AsyncGenerator[RequestOutput, None]],
+            speaker_embeddings: torch.Tensor,
+            multimodal_data: List[torch.Tensor],
+            chunk_size: int = 20,
+    ) -> AsyncGenerator[XTTSOutput, None]:
+        """
+        Process multiple token generators concurrently and emit results sequentially.
+        Uses a queue-based approach to handle multiple generators reliably.
+        """
+        # Create a queue for each generator to store its results
+        queues = [asyncio.Queue() for _ in generators]
+        # Create tasks for processing each generator
+        tasks = []
+        for i, generator in enumerate(generators):
+            task = asyncio.create_task(
+                self._process_single_generator(
+                    generator,
+                    queues[i],
+                    speaker_embeddings,
+                    multimodal_data[i],
+                    chunk_size
+                )
+            )
+            tasks.append(task)
+        try:
+            # Process queues in sequence
+            for i, queue in enumerate(queues):
+                while True:
+                    result = await queue.get()
+                    if result is None:
+                        # This generator has finished
+                        break
+                    else:
+                        yield result
+        finally:
+            # Ensure all tasks are properly cleaned up
+            for task in tasks:
+                if not task.done():
+                    task.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+    async def _process_single_generator(
+            self,
+            generator: AsyncGenerator[RequestOutput, None],
+            queue: asyncio.Queue,
+            speaker_embeddings: torch.Tensor,
+            gpt_embed_input: torch.Tensor,
+            chunk_size: int
+    ) -> None:
+        """Process a single generator and put results in its queue."""
+        try:
+            last_decoded_token = 0
+            accumulated_tokens = []
+            async for output in generator:
+                # Get new tokens
+                new_tokens = output.outputs[0].token_ids[last_decoded_token:]
+                accumulated_tokens.extend(new_tokens)
+                last_decoded_token = len(accumulated_tokens)
+                # Process tokens when we have enough or it's the final output
+                if output.finished:# or len(accumulated_tokens) >= chunk_size: se lascio con acculated token mi ripete gli stesis toke, why??
+                    # Process the accumulated tokens
+                    hidden_states = await self.get_model_logits(
+                        accumulated_tokens,
+                        {
+                            "audio": {
+                                'embeds': gpt_embed_input,
+                                "is_logits_only_mode": True
+                            }
+                        }
+                    )
+                    # Generate audio segment
+                    wav = await asyncio.get_event_loop().run_in_executor(
+                        self.executor,
+                        lambda: self.hifigan_decoder.inference(
+                            hidden_states,
+                            g=speaker_embeddings
+                        ).cpu().numpy().squeeze()
+                    )
+                    # Put result in queue
+                    await queue.put(XTTSOutput(
+                        request_id=output.request_id,
+                        wav=wav
+                    ))
+                    # Reset accumulated tokens
+                    accumulated_tokens = []
+                if output.finished:
+                    break
+        except Exception as e:
+            logging.error(f"Error in generator processing: {e}")
+        finally:
+            # Signal completion
+            await queue.put(None)
+    async def generate_speech_async_from_streaming_source(self, request: XTTSRequest) -> AsyncGenerator[XTTSOutput, None]:
+        """Generate speech for streaming source of text, making a streaming source of audio tokens and then decoding
+        and returning a streaming audio response."""
+        assert isinstance(request.text, AsyncGenerator), "Text must be an AsyncGenerator for streaming source."
+        # Prepare input with conditioning
+        gpt_cond_latent, speaker_embeddings = await self.get_conditioning_latents_async(
+            request.speaker_file,
+            request.max_ref_length,
+            request.gpt_cond_len,
+            request.gpt_cond_chunk_len
+        )
         sampling_params = SamplingParams(
             temperature=request.temperature,
             top_p=request.top_p,
+            detokenize=False,
             top_k=request.top_k,
+            logits_processors=[LogitsRepetitionPenalizer(request.repetition_penalty)],
+            repetition_penalty=1.0,  # Since we're handling repetition penalty manually
             max_tokens=self.gpt_config.gpt_max_audio_tokens,
+            ignore_eos=True,  # Ignore the tokenizer eos token since it is for textual generation
+            stop_token_ids=[self.mel_eos_token_id],
         )
+        accumulated_text = ""
+        async for text in request.text:
+            text = text.strip()
+            accumulated_text += text
+            if len(accumulated_text) > request.generate_every_n_chars:
+                tokens, embeddings = await self.prepare_text_tokens_async(accumulated_text, request.language)
+                gpt_embed_input = [torch.cat([gpt_cond_latent, embeddings[0]], dim=0)]
+                engine_inputs = TokensPrompt(prompt_token_ids=tokens)
+                if gpt_embed_input is not None:
+                    engine_inputs["multi_modal_data"] = {"audio": {"embeds": gpt_embed_input, "is_logits_only_mode": False}}
+                token_generator = [self.llm_engine.generate(
+                    prompt=engine_inputs,
+                    sampling_params=sampling_params,
+                    request_id=request.request_id,
+                )]
+                # Process tokens to speech
+                async for output in self.process_tokens_to_speech(
+                        token_generator,
+                        speaker_embeddings,
+                        gpt_embed_input,
+                        chunk_size=50
+                ):
+                    yield output
+                accumulated_text = ""
+    async def generate_speech_from_text_async(self, request: XTTSRequest) -> AsyncGenerator[XTTSOutput, None]:
+        """Generate speech for a single request asynchronously."""
+        # Prepare input with conditioning
+        tokens_list, gpt_embed_inputs, speaker_embeddings = await self.prepare_inputs_async(
+            request.text,
+            request.language,
+            request.speaker_file,
+            request.max_ref_length,
+            request.gpt_cond_len,
+            request.gpt_cond_chunk_len,
+            split_text=True  # Split text to avoid OOM on big texts
+        )
+        # Start all requests in parallel
+        generators = []
+        for seq_index, sequence in enumerate(tokens_list):
+            sampling_params = SamplingParams(
+                temperature=request.temperature,
+                top_p=request.top_p,
+                detokenize=False,
+                top_k=request.top_k,
+                logits_processors=[LogitsRepetitionPenalizer(request.repetition_penalty)],
+                repetition_penalty=1.0,  # Since we're handling repetition penalty manually
+                max_tokens=self.gpt_config.gpt_max_audio_tokens,
+                ignore_eos=True,  # Ignore the tokenizer eos token since it is for textual generation
+                stop_token_ids=[self.mel_eos_token_id],
             )
+            engine_inputs = TokensPrompt(prompt_token_ids=sequence)
+            if gpt_embed_inputs is not None:
+                engine_inputs["multi_modal_data"] = {"audio": {"embeds": gpt_embed_inputs[seq_index], "is_logits_only_mode": False}}
+            # Get audio token generator from VLLM
+            token_generator = self.llm_engine.generate(
+                prompt=engine_inputs,
+                sampling_params=sampling_params,
+                request_id=f"{request.request_id}_{seq_index}",
             )
+            generators.append(token_generator)
+        # Process tokens to speech
+        async for output in self.process_tokens_to_speech(
+                generators,
+                speaker_embeddings,
+                gpt_embed_inputs,
+                chunk_size=50
+        ):
+            yield output
+    def generate_speech_from_text(self, request: XTTSRequest) -> List[XTTSOutput]:
+        """
+        Synchronous wrapper for generate_speech_from_text_async.
+        Args:
+            request: XTTSRequest object containing generation parameters
+        Returns:
+            List of XTTSOutput containing the generated speech segments
+        """
+        async def _collect_outputs():
+            outputs = []
+            async for output in self.generate_speech_from_text_async(request):
+                outputs.append(output)
+            return outputs
+        # Run the async code in an event loop
+        import asyncio
+        # Get or create an event loop
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+        if loop.is_running():
+            # Create a new loop if the current one is running
+            new_loop = asyncio.new_event_loop()
+            results = new_loop.run_until_complete(_collect_outputs())
+            new_loop.close()
+        else:
+            results = loop.run_until_complete(_collect_outputs())
+        return results

xttsv2_gpt2/config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "XttsGPT"
+  ],
+  "attn_pdrop": 0.1,
+  "audio_config": {
+    "mel_channels": 80,
+    "output_sample_rate": 24000,
+    "sample_rate": 22050
+  },
+  "auto_map": {
+    "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+    "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
+    "AutoTokenizer": "AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast"
+  },
+  "decoder_input_dim": 1024,
+  "enable_redaction": false,
+  "gpt_batch_size": 1,
+  "gpt_max_audio_tokens": 605,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "kv_cache": true,
+  "layer_norm_epsilon": 1e-05,
+  "max_audio_tokens": 605,
+  "max_prompt_tokens": 70,
+  "max_text_tokens": 402,
+  "model_type": "xtts_gpt",
+  "n_inner": 4096,
+  "num_attention_heads": 16,
+  "num_audio_tokens": 1026,
+  "num_hidden_layers": 30,
+  "number_text_tokens": 6681,
+  "reorder_and_upcast_attn": false,
+  "scale_attn_by_inverse_layer_idx": false,
+  "start_audio_token": 1024,
+  "start_text_token": null,
+  "stop_audio_token": 1025,
+  "stop_text_token": null,
+  "transformers_version": "4.46.0",
+  "use_masking_gt_prompt_approach": true,
+  "use_perceiver_resampler": true,
+  "vocab_size": 6681
+}

xttsv2_gpt2/gpt2_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:104d92b2297c243b64d1417bd5cfda015faca0a670e9bc90088eed0e844f8e35
+size 1522497936

xttsv2_gpt2/gpt_config.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from dataclasses import asdict, dataclass
+from typing import Dict, Optional, List
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+@dataclass
+class GPTAudioConfig:
+    """Configuration for GPT audio processing parameters"""
+    # Number of mel channels (default 80)
+    mel_channels: int = 80
+    # Sample rate on the input side, presumably in Hz (default 22050)
+    sample_rate: int = 22050
+    # Sample rate of the produced audio, presumably in Hz (default 24000)
+    output_sample_rate: int = 24000
+@dataclass
+class XTTSAudioConfig:
+    """Configuration for audio processing parameters"""
+    # Sample rate on the input side, presumably in Hz
+    sample_rate: int = 22050
+    # Sample rate of the produced audio, presumably in Hz
+    output_sample_rate: int = 24000
+    # Number of mel channels
+    mel_channels: int = 80
+    # Spectrogram framing parameters, presumably expressed in samples
+    hop_length: int = 256
+    win_length: int = 1024
+    n_fft: int = 1024
+    # Lower/upper frequency bounds, presumably for the mel filterbank (Hz)
+    fmin: int = 0
+    fmax: int = 8000
+    # Spectrogram power/exponent setting
+    power: float = 1.0
+    # Optional path to a mel normalisation file; None when not provided
+    mel_norms_file: Optional[str] = None
+class XTTSGPTConfig(PretrainedConfig):
+    """Configuration class for the GPT component of XTTS."""
+    model_type = "xtts_gpt"
+    def __init__(
+            self,
+            # Model architecture
+            hidden_size: int = 1024,  # gpt_n_model_channels in original
+            n_inner: int = 4096,
+            num_hidden_layers: int = 30,  # gpt_layers in original
+            num_attention_heads: int = 16,  # gpt_n_heads in original
+            # Tokenizer settings
+            vocab_size: int = 6681,  # gpt_number_text_tokens in original
+            number_text_tokens: int = 6681,  # Explicit text token vocabulary size
+            start_text_token: Optional[int] = None,
+            stop_text_token: Optional[int] = None,
+            # Audio token settings
+            num_audio_tokens: int = 1026,  # gpt_num_audio_tokens in original
+            start_audio_token: int = 1024,  # gpt_start_audio_token in original
+            stop_audio_token: int = 1025,  # gpt_stop_audio_token in original
+            # Sequence length settings
+            max_audio_tokens: int = 605,  # gpt_max_audio_tokens in original
+            max_text_tokens: int = 402,  # gpt_max_text_tokens in original
+            max_prompt_tokens: int = 70,  # gpt_max_prompt_tokens in original
+            gpt_max_audio_tokens: int = 605,  # Used for generation
+            # Model behavior settings
+            use_masking_gt_prompt_approach: bool = True,  # gpt_use_masking_gt_prompt_approach in original
+            use_perceiver_resampler: bool = True,  # gpt_use_perceiver_resampler in original
+            kv_cache: bool = True,
+            enable_redaction: bool = False,
+            # GPT batch settings
+            gpt_batch_size: int = 1,
+            # Audio processing
+            audio_config: Optional[Dict] = None,
+            # Architecture specifics
+            layer_norm_epsilon: float = 1e-5,
+            initializer_range: float = 0.02,
+            add_cross_attention: bool = False,
+            scale_attn_by_inverse_layer_idx: bool = False,
+            reorder_and_upcast_attn: bool = False,
+            # Size settings for the decoder
+            decoder_input_dim: int = 1024,
+            architectures=["XttsGPT"],
+            auto_map={
+                "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+                "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
+            },
+            activation_function: str = "gelu",
+            attn_pdrop: float = 0.1,
+            **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.architectures = architectures
+        self.auto_map = auto_map
+        self.audio_config = GPTAudioConfig(
+            **audio_config if audio_config is not None else {}
+        )
+        self.activation_function = activation_function
+        self.attn_pdrop = attn_pdrop
+        self.hidden_size = hidden_size
+        self.n_inner = n_inner
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.vocab_size = vocab_size
+        self.number_text_tokens = number_text_tokens
+        self.start_text_token = start_text_token
+        self.stop_text_token = stop_text_token
+        self.num_audio_tokens = num_audio_tokens
+        self.start_audio_token = start_audio_token
+        self.stop_audio_token = stop_audio_token
+        self.max_audio_tokens = max_audio_tokens
+        self.max_text_tokens = max_text_tokens
+        self.max_prompt_tokens = max_prompt_tokens
+        self.gpt_max_audio_tokens = gpt_max_audio_tokens
+        self.use_masking_gt_prompt_approach = use_masking_gt_prompt_approach
+        self.use_perceiver_resampler = use_perceiver_resampler
+        self.kv_cache = kv_cache
+        self.enable_redaction = enable_redaction
+        self.gpt_batch_size = gpt_batch_size
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.add_cross_attention = add_cross_attention
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.decoder_input_dim = decoder_input_dim
+    def to_dict(self) -> Dict:
+        """Convert the config to a dictionary."""
+        output = super().to_dict()
+        output["audio_config"] = asdict(self.audio_config)
+        return output
+    @classmethod
+    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSGPTConfig":
+        """Create a config from a dictionary."""
+        return cls(**config_dict)

xttsv2_gpt2/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "[START]",
+  "eos_token": "[STOP]",
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]"
+}

xttsv2_gpt2/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

xttsv2_gpt2/tokenizer.py ADDED Viewed

	@@ -0,0 +1,887 @@

+import os
+import re
+import textwrap
+from typing import List, Optional, Union, Dict, Any
+from functools import cached_property
+import pypinyin
+import torch
+from hangul_romanize import Transliter
+from hangul_romanize.rule import academic
+from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
+from transformers import PreTrainedTokenizerFast, BatchEncoding
+from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
+from tokenizers import Tokenizer
+from tokenizers.pre_tokenizers import WhitespaceSplit
+from tokenizers.processors import TemplateProcessing
+from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
+import cutlet
+# Text preprocessing functions
+def get_spacy_lang(lang):
+    if lang == "zh":
+        return Chinese()
+    elif lang == "ja":
+        return Japanese()
+    elif lang == "ar":
+        return Arabic()
+    elif lang == "es":
+        return Spanish()
+    else:
+        # For most languages, English does the job
+        return English()
+def split_sentence(text, lang, text_split_length=250):
+    """Preprocess the input text and split into sentences based on language."""
+    text_splits = []
+    if text_split_length is not None and len(text) >= text_split_length:
+        text_splits.append("")
+        nlp = get_spacy_lang(lang)
+        nlp.add_pipe("sentencizer")
+        doc = nlp(text)
+        for sentence in doc.sents:
+            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
+                text_splits[-1] += " " + str(sentence)
+                text_splits[-1] = text_splits[-1].lstrip()
+            elif len(str(sentence)) > text_split_length:
+                for line in textwrap.wrap(
+                    str(sentence),
+                    width=text_split_length,
+                    drop_whitespace=True,
+                    break_on_hyphens=False,
+                    tabsize=1,
+                ):
+                    text_splits.append(str(line))
+            else:
+                text_splits.append(str(sentence))
+        if len(text_splits) > 1 and text_splits[0] == "":
+                del text_splits[0]
+    else:
+        text_splits = [text.lstrip()]
+    return text_splits
+_whitespace_re = re.compile(r"\s+")
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = {
+    "en": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("mrs", "misess"),
+            ("mr", "mister"),
+            ("dr", "doctor"),
+            ("st", "saint"),
+            ("co", "company"),
+            ("jr", "junior"),
+            ("maj", "major"),
+            ("gen", "general"),
+            ("drs", "doctors"),
+            ("rev", "reverend"),
+            ("lt", "lieutenant"),
+            ("hon", "honorable"),
+            ("sgt", "sergeant"),
+            ("capt", "captain"),
+            ("esq", "esquire"),
+            ("ltd", "limited"),
+            ("col", "colonel"),
+            ("ft", "fort"),
+        ]
+    ],
+    "es": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("sra", "señora"),
+            ("sr", "señor"),
+            ("dr", "doctor"),
+            ("dra", "doctora"),
+            ("st", "santo"),
+            ("co", "compañía"),
+            ("jr", "junior"),
+            ("ltd", "limitada"),
+        ]
+    ],
+    "fr": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("mme", "madame"),
+            ("mr", "monsieur"),
+            ("dr", "docteur"),
+            ("st", "saint"),
+            ("co", "compagnie"),
+            ("jr", "junior"),
+            ("ltd", "limitée"),
+        ]
+    ],
+    "de": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("fr", "frau"),
+            ("dr", "doktor"),
+            ("st", "sankt"),
+            ("co", "firma"),
+            ("jr", "junior"),
+        ]
+    ],
+    "pt": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("sra", "senhora"),
+            ("sr", "senhor"),
+            ("dr", "doutor"),
+            ("dra", "doutora"),
+            ("st", "santo"),
+            ("co", "companhia"),
+            ("jr", "júnior"),
+            ("ltd", "limitada"),
+        ]
+    ],
+    "it": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # ("sig.ra", "signora"),
+            ("sig", "signore"),
+            ("dr", "dottore"),
+            ("st", "santo"),
+            ("co", "compagnia"),
+            ("jr", "junior"),
+            ("ltd", "limitata"),
+        ]
+    ],
+    "pl": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("p", "pani"),
+            ("m", "pan"),
+            ("dr", "doktor"),
+            ("sw", "święty"),
+            ("jr", "junior"),
+        ]
+    ],
+    "ar": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # There are not many common abbreviations in Arabic as in English.
+        ]
+    ],
+    "zh": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
+    "cs": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dr", "doktor"),  # doctor
+            ("ing", "inženýr"),  # engineer
+            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
+            # Other abbreviations would be specialized and not as common.
+        ]
+    ],
+    "ru": [
+        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("г-жа", "госпожа"),  # Mrs.
+            ("г-н", "господин"),  # Mr.
+            ("д-р", "доктор"),  # doctor
+            # Other abbreviations are less common or specialized.
+        ]
+    ],
+    "nl": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dhr", "de heer"),  # Mr.
+            ("mevr", "mevrouw"),  # Mrs.
+            ("dr", "dokter"),  # doctor
+            ("jhr", "jonkheer"),  # young lord or nobleman
+            # Dutch uses more abbreviations, but these are the most common ones.
+        ]
+    ],
+    "tr": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("b", "bay"),  # Mr.
+            ("byk", "büyük"),  # büyük
+            ("dr", "doktor"),  # doctor
+            # Add other Turkish abbreviations here if needed.
+        ]
+    ],
+    "hu": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            ("dr", "doktor"),  # doctor
+            ("b", "bácsi"),  # Mr.
+            ("nőv", "nővér"),  # nurse
+            # Add other Hungarian abbreviations here if needed.
+        ]
+    ],
+    "ko": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
+}
+def expand_abbreviations_multilingual(text, lang="en"):
+    if lang in _abbreviations:
+        for regex, replacement in _abbreviations[lang]:
+            text = re.sub(regex, replacement, text)
+    return text
+_symbols_multilingual = {
+    "en": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " and "),
+            ("@", " at "),
+            ("%", " percent "),
+            ("#", " hash "),
+            ("$", " dollar "),
+            ("£", " pound "),
+            ("°", " degree "),
+        ]
+    ],
+    "es": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " y "),
+            ("@", " arroba "),
+            ("%", " por ciento "),
+            ("#", " numeral "),
+            ("$", " dolar "),
+            ("£", " libra "),
+            ("°", " grados "),
+        ]
+    ],
+    "fr": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " et "),
+            ("@", " arobase "),
+            ("%", " pour cent "),
+            ("#", " dièse "),
+            ("$", " dollar "),
+            ("£", " livre "),
+            ("°", " degrés "),
+        ]
+    ],
+    "de": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " und "),
+            ("@", " at "),
+            ("%", " prozent "),
+            ("#", " raute "),
+            ("$", " dollar "),
+            ("£", " pfund "),
+            ("°", " grad "),
+        ]
+    ],
+    "pt": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " e "),
+            ("@", " arroba "),
+            ("%", " por cento "),
+            ("#", " cardinal "),
+            ("$", " dólar "),
+            ("£", " libra "),
+            ("°", " graus "),
+        ]
+    ],
+    "it": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " e "),
+            ("@", " chiocciola "),
+            ("%", " per cento "),
+            ("#", " cancelletto "),
+            ("$", " dollaro "),
+            ("£", " sterlina "),
+            ("°", " gradi "),
+        ]
+    ],
+    "pl": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " i "),
+            ("@", " małpa "),
+            ("%", " procent "),
+            ("#", " krzyżyk "),
+            ("$", " dolar "),
+            ("£", " funt "),
+            ("°", " stopnie "),
+        ]
+    ],
+    "ar": [
+        # Arabic
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " و "),
+            ("@", " على "),
+            ("%", " في المئة "),
+            ("#", " رقم "),
+            ("$", " دولار "),
+            ("£", " جنيه "),
+            ("°", " درجة "),
+        ]
+    ],
+    "zh": [
+        # Chinese
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " 和 "),
+            ("@", " 在 "),
+            ("%", " 百分之 "),
+            ("#", " 号 "),
+            ("$", " 美元 "),
+            ("£", " 英镑 "),
+            ("°", " 度 "),
+        ]
+    ],
+    "cs": [
+        # Czech
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " a "),
+            ("@", " na "),
+            ("%", " procento "),
+            ("#", " křížek "),
+            ("$", " dolar "),
+            ("£", " libra "),
+            ("°", " stupně "),
+        ]
+    ],
+    "ru": [
+        # Russian
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " и "),
+            ("@", " собака "),
+            ("%", " процентов "),
+            ("#", " номер "),
+            ("$", " доллар "),
+            ("£", " фунт "),
+            ("°", " градус "),
+        ]
+    ],
+    "nl": [
+        # Dutch
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " en "),
+            ("@", " bij "),
+            ("%", " procent "),
+            ("#", " hekje "),
+            ("$", " dollar "),
+            ("£", " pond "),
+            ("°", " graden "),
+        ]
+    ],
+    "tr": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " ve "),
+            ("@", " at "),
+            ("%", " yüzde "),
+            ("#", " diyez "),
+            ("$", " dolar "),
+            ("£", " sterlin "),
+            ("°", " derece "),
+        ]
+    ],
+    "hu": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " és "),
+            ("@", " kukac "),
+            ("%", " százalék "),
+            ("#", " kettőskereszt "),
+            ("$", " dollár "),
+            ("£", " font "),
+            ("°", " fok "),
+        ]
+    ],
+    "ko": [
+        # Korean
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " 그리고 "),
+            ("@", " 에 "),
+            ("%", " 퍼센트 "),
+            ("#", " 번호 "),
+            ("$", " 달러 "),
+            ("£", " 파운드 "),
+            ("°", " 도 "),
+        ]
+    ],
+}
+def expand_symbols_multilingual(text, lang="en"):
+    if lang in _symbols_multilingual:
+        for regex, replacement in _symbols_multilingual[lang]:
+            text = re.sub(regex, replacement, text)
+            text = text.replace("  ", " ")  # Ensure there are no double spaces
+    return text.strip()
+# Per-language pattern for ordinal numbers. Group 1 captures the digits, which is
+# the only part read by _expand_ordinal. Languages without an entry get no ordinal
+# expansion in expand_numbers_multilingual.
+_ordinal_re = {
+    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
+    "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
+    "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
+    "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
+    "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
+    "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
+    "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
+    "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
+    "cs": re.compile(r"([0-9]+)\.(?=\s|$)"),  # In Czech, a dot is often used after the number to indicate ordinals.
+    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
+    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
+    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
+    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
+    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
+}
+# Any run of ASCII digits; used for the final plain-integer expansion pass.
+_number_re = re.compile(r"[0-9]+")
+# Currency amounts with the symbol either before or after the number. The number
+# may contain dots and commas but must end in a digit.
+_currency_re = {
+    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
+    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
+    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
+}
+# Number with "," as thousands separator and optional "." decimals (applied for "en" and "ru").
+_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
+# Number with "." as thousands separator and optional "," decimals (applied for the other languages).
+_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
+# Decimal number with either "." or "," as decimal mark; group 1 is the whole number.
+_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
+def _remove_commas(m):
+    text = m.group(0)
+    if "," in text:
+        text = text.replace(",", "")
+    return text
+def _remove_dots(m):
+    text = m.group(0)
+    if "." in text:
+        text = text.replace(".", "")
+    return text
+def _expand_decimal_point(m, lang="en"):
+    amount = m.group(1).replace(",", ".")
+    return num2words(float(amount), lang=lang if lang != "cs" else "cz")
+def _expand_currency(m, lang="en", currency="USD"):
+    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
+    full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")
+    and_equivalents = {
+        "en": ", ",
+        "es": " con ",
+        "fr": " et ",
+        "de": " und ",
+        "pt": " e ",
+        "it": " e ",
+        "pl": ", ",
+        "cs": ", ",
+        "ru": ", ",
+        "nl": ", ",
+        "ar": ", ",
+        "tr": ", ",
+        "hu": ", ",
+        "ko": ", ",
+    }
+    if amount.is_integer():
+        last_and = full_amount.rfind(and_equivalents.get(lang, ", "))
+        if last_and != -1:
+            full_amount = full_amount[:last_and]
+    return full_amount
+def _expand_ordinal(m, lang="en"):
+    return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
+def _expand_number(m, lang="en"):
+    return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
+def expand_numbers_multilingual(text, lang="en"):
+    if lang == "zh":
+        text = zh_num2words()(text)
+    else:
+        if lang in ["en", "ru"]:
+            text = re.sub(_comma_number_re, _remove_commas, text)
+        else:
+            text = re.sub(_dot_number_re, _remove_dots, text)
+        try:
+            text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
+            text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
+            text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
+        except Exception as e:
+            pass
+        if lang != "tr":
+            text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
+        if lang in _ordinal_re:
+            text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
+        text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+def multilingual_cleaners(text, lang):
+    text = text.replace('"', "")
+    if lang == "tr":
+        text = text.replace("İ", "i")
+        text = text.replace("Ö", "ö")
+        text = text.replace("Ü", "ü")
+    text = lowercase(text)
+    text = expand_numbers_multilingual(text, lang)
+    text = expand_abbreviations_multilingual(text, lang)
+    text = expand_symbols_multilingual(text, lang=lang)
+    text = collapse_whitespace(text)
+    return text
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def chinese_transliterate(text):
+    return "".join(
+        [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
+    )
+def japanese_cleaners(text, katsu):
+    text = katsu.romaji(text)
+    text = lowercase(text)
+    return text
+def korean_transliterate(text, transliter):
+    return transliter.translit(text)
+# Fast Tokenizer Class
+class XTTSTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
+    """
+    def __init__(
+            self,
+            vocab_file: str = None,
+            tokenizer_object: Optional[Tokenizer] = None,
+            unk_token: str = "[UNK]",
+            pad_token: str = "[PAD]",
+            bos_token: str = "[START]",
+            eos_token: str = "[STOP]",
+            auto_map: dict = {"AutoTokenizer": ["AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast", None]},
+            clean_up_tokenization_spaces: bool = True,
+            **kwargs
+    ):
+        if tokenizer_object is None and vocab_file is not None:
+            tokenizer_object = Tokenizer.from_file(vocab_file)
+        if tokenizer_object is not None:
+            # Configure the tokenizer
+            tokenizer_object.pre_tokenizer = WhitespaceSplit()
+            tokenizer_object.post_processor = TemplateProcessing(
+                single=f"{bos_token} $A {eos_token}",
+                special_tokens=[
+                    (bos_token, tokenizer_object.token_to_id(bos_token)),
+                    (eos_token, tokenizer_object.token_to_id(eos_token)),
+                ],
+            )
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs
+        )
+        # Character limits per language
+        self.char_limits = {
+            "en": 250, "de": 253, "fr": 273, "es": 239,
+            "it": 213, "pt": 203, "pl": 224, "zh": 82,
+            "ar": 166, "cs": 186, "ru": 182, "nl": 251,
+            "tr": 226, "ja": 71, "hu": 224, "ko": 95,
+        }
+        # Initialize language tools
+        self._katsu = None
+        self._korean_transliter = Transliter(academic)
+        # Ensure pad_token_id is set
+        if self.pad_token_id is None:
+            self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)
+    @cached_property
+    def katsu(self):
+        if self._katsu is None:
+            self._katsu = cutlet.Cutlet()
+        return self._katsu
+    def preprocess_text(self, text: str, lang: str) -> str:
+        """Apply text preprocessing for language"""
+        base_lang = lang.split("-")[0]  # remove region
+        if base_lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it",
+                         "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+            text = multilingual_cleaners(text, base_lang)
+            if base_lang == "zh":
+                text = chinese_transliterate(text)
+            if base_lang == "ko":
+                text = korean_transliterate(text, self._korean_transliter)
+        elif base_lang == "ja":
+            text = japanese_cleaners(text, self.katsu)
+        else:
+            text = basic_cleaners(text)
+        return text
+    def batch_encode_with_split(self, texts: Union[str, List[str]], lang: Union[str, List[str]],
+                                **kwargs) -> torch.Tensor:
+        """
+        Split texts into smaller chunks based on language character limits and encode them using HuggingFace fast tokenizer.
+        """
+        # Convert single inputs to lists
+        if isinstance(texts, str):
+            texts = [texts]
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(texts) > 1:
+            lang = lang * len(texts)
+        # Check if texts and lang have the same length
+        if len(texts) != len(lang):
+            raise ValueError(f"Number of texts ({len(texts)}) does not match number of languages ({len(lang)}).")
+        batch_chunks = []
+        max_splits = 0
+        # For each text, split into chunks based on character limit
+        for text, text_lang in zip(texts, lang):
+            # Get language character limit
+            base_lang = text_lang.split("-")[0]
+            char_limit = self.char_limits.get(base_lang, 250)
+            # Clean and preprocess
+            text = self.preprocess_text(text, text_lang)
+            # Split text into sentences/chunks based on language
+            chunks = split_sentence(text, base_lang, text_split_length=char_limit)
+            # Format each chunk
+            formatted_chunks = []
+            for chunk in chunks:
+                lang_code = "zh-cn" if base_lang == "zh" else base_lang
+                formatted_chunk = f"[{lang_code}]{chunk}"
+                formatted_chunk = formatted_chunk.replace(" ", "[SPACE]")
+                formatted_chunks.append(formatted_chunk)
+            batch_chunks.append(formatted_chunks)
+            max_splits = max(max_splits, len(formatted_chunks))
+        # Flatten all chunks to a single list for batch encoding
+        all_chunks = [chunk for chunks in batch_chunks for chunk in chunks]
+        # Ensure the tokenizer is a fast tokenizer
+        if not self.is_fast:
+            raise ValueError("The tokenizer must be a fast tokenizer.")
+        # Encode all chunks using the fast tokenizer
+        encoding: BatchEncoding = self(
+            all_chunks,
+            add_special_tokens=False,
+            padding=True,
+            return_tensors='pt',
+            **kwargs
+        )
+        # The 'input_ids' tensor will have shape [total_chunks, max_sequence_length]
+        input_ids = encoding['input_ids']  # Tensor of shape [total_chunks, sequence_length]
+        # Now, we need to organize this tensor back into the desired shape
+        # We'll use 'batch_indices' to keep track of which chunks belong to which text
+        batch_indices = []
+        idx = 0
+        for chunks in batch_chunks:
+            batch_indices.append((idx, idx + len(chunks)))
+            idx += len(chunks)
+        # Determine max sequence length and add space for special tokens
+        max_seq_length = input_ids.size(1) + 2  # +2 for BOS and EOS tokens
+        # Prepare the final tensor
+        batch_size = len(texts)
+        padded_batch = torch.full(
+            (batch_size, max_splits, max_seq_length),
+            fill_value=self.pad_token_id,
+            dtype=torch.long
+        )
+        # Populate the final tensor with BOS and EOS tokens
+        for i, (start, end) in enumerate(batch_indices):
+            chunks_input_ids = input_ids[start:end]
+            num_chunks = chunks_input_ids.size(0)
+            for j in range(num_chunks):
+                sequence = chunks_input_ids[j]
+                # find the length of the sequence
+                seq_len = (sequence != self.pad_token_id).sum().item()
+                # insert BOS
+                padded_batch[i, j, 0] = self.bos_token_id
+                # insert sequence
+                padded_batch[i, j, 1:seq_len + 1] = sequence[:seq_len]
+                # insert EOS
+                padded_batch[i, j, seq_len + 1] = self.eos_token_id
+        return padded_batch
+    def _batch_encode_plus(
+            self,
+            batch_text_or_text_pairs,
+            add_special_tokens: bool = True,
+            padding_strategy=PaddingStrategy.DO_NOT_PAD,
+            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
+            max_length: Optional[int] = None,
+            stride: int = 0,
+            is_split_into_words: bool = False,
+            pad_to_multiple_of: Optional[int] = None,
+            return_tensors: Optional[str] = None,
+            return_token_type_ids: Optional[bool] = None,
+            return_attention_mask: Optional[bool] = None,
+            return_overflowing_tokens: bool = False,
+            return_special_tokens_mask: bool = False,
+            return_offsets_mapping: bool = False,
+            return_length: bool = False,
+            verbose: bool = True,
+            **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Override batch encoding to handle language-specific preprocessing
+        """
+        lang = kwargs.pop("lang", ["en"] * len(batch_text_or_text_pairs))
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(batch_text_or_text_pairs) > 1:
+            lang = lang * len(batch_text_or_text_pairs)
+        # Check if batch_text_or_text_pairs and lang have the same length
+        if len(batch_text_or_text_pairs) != len(lang):
+            raise ValueError(f"Number of texts ({len(batch_text_or_text_pairs)}) does not match number of languages ({len(lang)}).")
+        # Preprocess each text in the batch with its corresponding language
+        processed_texts = []
+        for text, text_lang in zip(batch_text_or_text_pairs, lang):
+            if isinstance(text, str):
+                # Check length and preprocess
+                #self.check_input_length(text, text_lang)
+                processed_text = self.preprocess_text(text, text_lang)
+                # Format text with language tag and spaces
+                base_lang = text_lang.split("-")[0]
+                lang_code = "zh-cn" if base_lang == "zh" else base_lang
+                processed_text = f"[{lang_code}]{processed_text}"
+                processed_text = processed_text.replace(" ", "[SPACE]")
+                processed_texts.append(processed_text)
+            else:
+                processed_texts.append(text)
+        # Call the parent class's encoding method with processed texts
+        return super()._batch_encode_plus(
+            processed_texts,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs
+        )
+    def __call__(
+            self,
+            text: Union[str, List[str]],
+            lang: Union[str, List[str]] = "en",
+            add_special_tokens: bool = True,
+            padding: Union[bool, str, PaddingStrategy] = False,
+            truncation: Union[bool, str, TruncationStrategy] = False,
+            max_length: Optional[int] = None,
+            stride: int = 0,
+            return_tensors: Optional[str] = None,
+            return_token_type_ids: Optional[bool] = None,
+            return_attention_mask: Optional[bool] = True,
+            **kwargs
+    ):
+        """
+        Main tokenization method
+        """
+        # Convert single string to list for batch processing
+        if isinstance(text, str):
+            text = [text]
+        if isinstance(lang, str):
+            lang = [lang]
+        # Ensure lang list matches texts list
+        if len(lang) == 1 and len(text) > 1:
+            lang = lang * len(text)
+        # Ensure text and lang lists have same length
+        if len(text) != len(lang):
+            raise ValueError(f"Number of texts ({len(text)}) does not match number of languages ({len(lang)}).")
+        # Convert padding strategy
+        if isinstance(padding, bool):
+            padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
+        else:
+            padding_strategy = PaddingStrategy(padding)
+        # Convert truncation strategy
+        if isinstance(truncation, bool):
+            truncation_strategy = TruncationStrategy.LONGEST_FIRST if truncation else TruncationStrategy.DO_NOT_TRUNCATE
+        else:
+            truncation_strategy = TruncationStrategy(truncation)
+        # Use the batch encoding method
+        encoded = self._batch_encode_plus(
+            text,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            lang=lang,
+            **kwargs
+        )
+        return encoded

xttsv2_gpt2/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,192 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[STOP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SPACE]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "259": {
+      "content": "[en]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "260": {
+      "content": "[de]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "261": {
+      "content": "[START]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "262": {
+      "content": "[fr]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "267": {
+      "content": "[ru]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "284": {
+      "content": "[es]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "285": {
+      "content": "[it]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "286": {
+      "content": "[pt]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "293": {
+      "content": "[cs]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "294": {
+      "content": "[pl]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "295": {
+      "content": "[tr]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "297": {
+      "content": "[nl]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5022": {
+      "content": "[ar]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5023": {
+      "content": "[zh-cn]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5412": {
+      "content": "[ja]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5753": {
+      "content": "[hu]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6152": {
+      "content": "[ko]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6680": {
+      "content": "[hi]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6681": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {"AutoTokenizer": ["AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast", null]},
+  "bos_token": "[START]",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "[STOP]",
+  "max_length": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "tokenizer_class": "XTTSTokenizerFast",
+  "unk_token": "[UNK]"
+}

xttsv2_gpt2/xtts2_gpt_modeling.py ADDED Viewed

	@@ -0,0 +1,505 @@

+import functools
+import math
+import random
+import uuid
+from array import array
+import numpy as np
+import torch
+import torch.nn as nn
+from typing import List, Optional, Union, Iterable, Tuple, Mapping, Dict
+from torch import Tensor
+from transformers import PretrainedConfig, GPT2Config
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.distributed import get_pp_group
+from vllm.inputs import InputContext, INPUT_REGISTRY, DecoderOnlyInputs, token_inputs
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.gpt2 import GPT2Block
+from vllm.model_executor.models.utils import make_layers, make_empty_intermediate_tensors_factory
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.sequence import IntermediateTensors, SequenceData, VLLM_TOKEN_ID_ARRAY_TYPE
+from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP
+class LearnedPositionEmbeddings(nn.Module):
+    def __init__(self, seq_len, model_dim, init=0.02, relative=False, supports_pp=False):
+        super().__init__()
+        # nn.Embedding
+        self.emb = VocabParallelEmbedding(seq_len, model_dim) if supports_pp else nn.Embedding(seq_len, model_dim)
+        # Initializing this way is standard for GPT-2
+        self.emb.weight.data.normal_(mean=0.0, std=init)
+        self.relative = relative
+        self.seq_len = seq_len
+    def forward(self, x):
+        sl = x.shape[1]
+        if self.relative:
+            start = random.randint(sl, self.seq_len) - sl
+            return self.emb(torch.arange(start, start + sl, device=x.device))
+        else:
+            return self.emb(torch.arange(0, sl, device=x.device))
+    def get_fixed_embedding(self, ind: torch.Tensor, dev: torch.device) -> torch.Tensor:
+        """Get position embeddings with batch support.
+        Handles both single and batched inputs, returning embeddings that can be
+        directly added to input embeddings of the same shape.
+        Args:
+            ind: Position indices tensor. Can be single or batched
+                 Shape: [..., seq_len] or [seq_len]
+            dev: Target device for the embeddings
+        Returns:
+            Position embeddings tensor matching input shape plus embedding dimension
+            Shape: [batch_size, seq_len, model_dim] or [1, 1, model_dim]
+        Example:
+            >>> pos_emb = LearnedPositionEmbeddings(100, 64)
+            >>> # Batched input
+            >>> batch_indices = torch.zeros((3, 5))  # batch_size=3, seq_len=5
+            >>> embeddings = pos_emb.get_fixed_embedding(batch_indices, 'cuda')
+            >>> embeddings.shape  # Returns: [3, 5, 64]
+        """
+        if ind.shape[0] > 1:
+            pos_embeddings = []
+            for index in ind:
+                # Create embeddings for each position in the sequence
+                pos_embeddings.append(self.emb(index))
+            # Shape: [1, seq_len, model_dim] -> [batch_size, seq_len, model_dim]
+            return torch.stack(pos_embeddings, dim=0)
+        else:
+            # Handle single input
+            # Shape: [1, 1, model_dim]
+            return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
+def get_xtts_max_audio_tokens(ctx: InputContext) -> int:
+    """Calculate maximum audio tokens based on text context and audio duration."""
+    # Based on GPT config and XTTSv2 settings
+    return 608
+def dummy_seq_data_for_xtts(
+        ctx: InputContext,
+        seq_len: int,
+        audio_count: int,
+) -> SequenceData:
+    """Create dummy sequence data for XTTS profiling."""
+    # Calculate audio token space needed
+    max_audio_token_conditioning = ctx.model_config.hf_config.max_prompt_tokens # in xtts prompt = voice conditioning
+    audio_placeholder = array(
+        VLLM_TOKEN_ID_ARRAY_TYPE,
+        [1]
+    ) * max_audio_token_conditioning
+    # Add separator between chunks
+    audio_token_ids = (audio_placeholder + array(VLLM_TOKEN_ID_ARRAY_TYPE, [1])) * audio_count
+    # Fill remaining sequence with padding
+    other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [1]) * (seq_len - len(audio_token_ids))
+    # not -1 since we add the start audio token
+    return SequenceData(
+        audio_token_ids +
+        other_token_ids
+    )
+def dummy_conditioning_for_xtts(
+        ctx: InputContext,
+        seq_len: int,
+        audio_count: int,
+) -> dict:
+    """Create dummy conditioning data for XTTS."""
+    return {
+        "audio": {
+            "embeds":[
+            torch.zeros(
+                (seq_len, ctx.model_config.hf_config.hidden_size),
+                dtype=ctx.model_config.dtype) for _ in range(audio_count)
+        ],
+            "is_logits_only_mode": False,
+        }
+    }
+def dummy_data_for_xtts(
+        ctx: InputContext,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+) -> Tuple[SequenceData, dict]:
+    """Create complete dummy data for XTTS profiling."""
+    audio_count = mm_counts["audio"]
+    seq_data = dummy_seq_data_for_xtts(ctx, seq_len, audio_count)
+    cond_data = dummy_conditioning_for_xtts(ctx, seq_len, audio_count)
+    return seq_data, cond_data
+def input_mapper_for_xtts(ctx: InputContext, data: Union[Dict, List[Tensor]]) -> MultiModalInputs:
+    """Map input data to XTTS format."""
+    assert isinstance(data, dict), "XTTS MultiModal input data must be a dictionary with keys: 'embeds', 'is_logits_only_mode'"
+    embeds = data.get("embeds")
+    is_logits_only_mode = data.get("is_logits_only_mode", False)
+    # Each item should be a torch tensor
+    for audio_input in embeds:
+        if not isinstance(audio_input, Tensor):
+            raise NotImplementedError(f"Unsupported data type: {type(audio_input)}")
+    return MultiModalInputs({"cond_latents": embeds,
+                             "is_logits_only_mode": is_logits_only_mode,
+                             })
+def input_processor_for_xtts2_gpt(ctx: InputContext, inputs: DecoderOnlyInputs):
+    """
+    We'll accomodate for the extra contditioning token and for the start audio token,
+    we actually insert a -1 repeated for the differecne in length between the conditioning and the tokenized text
+    and then we add 1 for the start audio token
+    Args:
+        ctx:
+        inputs:
+    Returns:
+    """
+    multi_modal_data = inputs.get("multi_modal_data")
+    audio_dict = multi_modal_data['audio']
+    audio = audio_dict.get('embeds')
+    is_last_decoding_pass = audio_dict.get("is_logits_only_mode", False)
+    prompt_token_ids = inputs.get("prompt_token_ids")
+    if not is_last_decoding_pass:
+        # we fill everything with 0 since we don't actually needs text token ids, it would mess up in the sampling step
+        new_token_ids = [1] * (audio.shape[0] + 1) # +1 for the start audio generation token
+    else:
+        new_token_ids = ([1] * audio.shape[0]) + prompt_token_ids
+    # the encoding had already been done externally to reuse the embeddings for later use but we
+    # account for the new token that will be added before generation
+    new_prompt = None
+    return token_inputs(prompt_token_ids=new_token_ids,
+                 prompt=new_prompt,
+                 multi_modal_data=multi_modal_data)
+@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_xtts)
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("audio", get_xtts_max_audio_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_xtts)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_xtts2_gpt)
+class XttsGPT(nn.Module, SupportsMultiModal, SupportsPP):
+    def __init__(
+            self,
+            config: PretrainedConfig,
+            multimodal_config: MultiModalConfig,
+            cache_config: Optional[CacheConfig] = None,
+            quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        # Core GPT components
+        self.gpt = GPT2Model(
+            config,
+            cache_config,
+            quant_config,
+            prefix="gpt"
+        )
+        self.final_norm =  nn.LayerNorm(config.hidden_size, bias=True, eps=config.layer_norm_epsilon)
+        # Output head for mel tokens
+        self.mel_head = ParallelLMHead(
+            config.num_audio_tokens,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix="mel_head"
+        )
+        self.audio_start_generation_token = config.start_audio_token
+        # Initialize logits processor and sampler
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(config.num_audio_tokens,
+                                                config.num_audio_tokens,
+                                                logit_scale)
+        self.sampler = Sampler()
+    @staticmethod
+    def check_is_logits_only_mode(is_logits_only_mode):
+        # First check if it's a boolean
+        if isinstance(is_logits_only_mode, bool):
+            return is_logits_only_mode
+        # Then check if it's a tensor
+        if torch.is_tensor(is_logits_only_mode):
+            # if it's a scalar tensor, return the value
+            if is_logits_only_mode.numel() == 1:
+                return bool(is_logits_only_mode.item())
+            # for non-scalar tensors, check if all elements are the same
+            return is_logits_only_mode.any()
+        # Fallback
+        return bool(is_logits_only_mode)
+    def _calculate_start_token_indices(self, cond_latents: List[torch.Tensor]) -> List[int]:
+        """Calcola gli indici dove inserire i token di start.
+        Args:
+            cond_latents: Lista di tensori di condizionamento
+        Returns:
+            Lista di indici dove inserire i token di start
+        """
+        indices = []
+        current_idx = 0
+        for cond_latent in cond_latents:
+            # Aggiungi la lunghezza del segmento corrente
+            current_idx += cond_latent.shape[0]
+            # Aggiungi l'indice per il token di start dopo questo segmento
+            indices.append(current_idx)
+            # Incrementa per il token di start che verrà aggiunto
+            current_idx += 1
+        return indices
+    # noinspection PyMethodOverriding
+    def forward(
+            self,
+            input_ids: torch.Tensor,
+            positions: torch.Tensor,
+            kv_caches: List[torch.Tensor],
+            attn_metadata: AttentionMetadata,
+            intermediate_tensors: Optional["IntermediateTensors"] = None,
+            cond_latents: Optional[torch.Tensor] = None,
+            is_logits_only_mode: bool = False,
+            **kwargs,
+    ) -> Union[torch.Tensor, "IntermediateTensors"]:
+        """Forward pass following VLLM pattern."""
+        # it is not the first iter either if the cond latents are emtpy or if the kv_caches are not empty
+        is_first_iteration = (input_ids==1).all()
+        #assert len(input_ids) == 1 or (cond_latents is not None and not is_first_iteration), "Conditioning data (voice conditioning+text_embeddings) is required for XTTS"
+        is_logits_only_mode = self.check_is_logits_only_mode(is_logits_only_mode)
+        if is_first_iteration:
+            # we add it to enable the model to start the generation
+            input_ids[-1] = self.audio_start_generation_token
+        hidden_states = self.gpt(
+            input_ids=input_ids,
+            position_ids=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            # this is the conditioning input ( voice conditioning + text_embeds )
+            input_embeds=cond_latents,
+            is_first_iteration=is_first_iteration,
+            is_logits_only_mode=is_logits_only_mode
+        )
+        return hidden_states
+    def compute_logits(
+            self,
+            hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        # normalize the hidden states
+        hidden_states = self.final_norm(hidden_states)
+        # Check if we need to collect hidden states
+        sampling_params = sampling_metadata.seq_groups[0].sampling_params
+        if hasattr(sampling_params, 'hidden_state_collector'):
+            # Call the collector directly with the hidden states
+            sampling_params.hidden_state_collector(hidden_states, None)  # The request_id is already bound
+        # Compute logits using the mel_head
+        logits = self.logits_processor(self.mel_head, hidden_states, sampling_metadata)
+        return logits
+    def sample(
+            self,
+            logits: torch.Tensor,
+            sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights following VLLM pattern."""
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_names = set()
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                #print(f"Skipping loading of {name} bc it is not found") # used to check if all weights were loaded
+                continue
+            param = params_dict[name]
+            if "c_attn" in name or "c_proj" in name or "c_fc" in name:
+                if name.endswith(".weight"):
+                    loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_names.add(name)
+        # used to check if all weights were loaded
+        assert set(params_dict.keys()) - loaded_names == set(), \
+            (f"Missing weights: {set(params_dict.keys()) - loaded_names}, "
+             f"this probably means you are using an incompatible model ")
+class GPT2Model(nn.Module):
+    """GPT-2 trunk that embeds audio token ids and prepends conditioning embeddings.
+    Token ids are embedded with ``wte``, position ids with ``wpe``; during the
+    first iteration or in logits-only mode the externally computed conditioning
+    embeddings are concatenated in front of the token embeddings before the
+    ``GPT2Block`` layers run.
+    """
+    def __init__(
+            self,
+            config: GPT2Config,
+            cache_config: Optional[CacheConfig] = None,
+            quant_config: Optional[QuantizationConfig] = None,
+            prefix: str = "",
+    ):
+        """Build the embeddings, the transformer layers and the final layer norm.
+        Args:
+            config: Model configuration. Cross attention, inverse-layer-index
+                attention scaling and attention reordering/upcasting must all
+                be disabled.
+            cache_config: Forwarded to every ``GPT2Block``.
+            quant_config: Forwarded to every ``GPT2Block``.
+            prefix: Name prefix; the layers are created under ``f"{prefix}.h"``.
+        """
+        super().__init__()
+        self.config = config
+        # These GPT-2 variants are not supported by this implementation
+        assert not config.add_cross_attention
+        assert not config.scale_attn_by_inverse_layer_idx
+        assert not config.reorder_and_upcast_attn
+        self.embed_dim = config.hidden_size
+        # Token embedding table over the audio token vocabulary
+        self.wte = VocabParallelEmbedding(config.num_audio_tokens, self.embed_dim)
+        # NOTE(review): when max_audio_tokens == -1 this is a functools.partial,
+        # which has no get_fixed_embedding method although forward() calls it.
+        self.wpe = (
+            LearnedPositionEmbeddings(config.max_audio_tokens + 3, config.decoder_input_dim)
+            if config.max_audio_tokens != -1
+            else functools.partial(config.null_position_embeddings, dim=config.decoder_input_dim)
+        )
+        # make_layers returns the [start_layer, end_layer) range together with the layers
+        self.start_layer, self.end_layer, self.h = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: GPT2Block(
+                config, cache_config, quant_config, prefix=prefix),
+            prefix=f"{prefix}.h")
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    config.hidden_size))
+    def forward(
+            self,
+            input_ids: torch.Tensor,
+            position_ids: torch.Tensor,
+            kv_caches: List[torch.Tensor],
+            attn_metadata: AttentionMetadata,
+            intermediate_tensors: Optional[IntermediateTensors],
+            # we pass this so that we can concatenate the text and conditioning input
+            input_embeds: Optional[torch.Tensor] = None,
+            is_first_iteration: bool = False,
+            is_logits_only_mode: bool = False,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Embed the inputs, run the layers in ``[start_layer, end_layer)`` and normalise.
+        Args:
+            input_ids: Token ids of the current step.
+            position_ids: Position ids of the current step.
+            kv_caches: One KV cache per layer, indexed relative to ``start_layer``.
+            attn_metadata: Attention metadata; ``seq_lens`` and
+                ``num_prefill_tokens`` are read here.
+            intermediate_tensors: Source of ``"hidden_states"`` when this is not
+                the first rank of the pp group.
+            input_embeds: Conditioning embeddings, either one tensor or a list of
+                tensors (one per packed sequence).
+            is_first_iteration: Truthy during the first (prefill) iteration.
+                NOTE(review): XttsGPT.forward passes the result of ``.all()``,
+                i.e. a tensor rather than a bool.
+            is_logits_only_mode: Truthy when only the audio tokens are embedded
+                and the conditioning is stripped from ids and positions.
+        Returns:
+            ``IntermediateTensors`` holding ``"hidden_states"`` when this is not
+            the last rank of the pp group, otherwise the hidden states after ``ln_f``.
+        """
+        # Only the first rank of the pp group builds the embeddings; the other
+        # ranks take their hidden states from intermediate_tensors.
+        if get_pp_group().is_first_rank:
+            # if we are not doing the final conversion from token to latent and it is first pass(prefill)
+            if is_first_iteration and not is_logits_only_mode:
+                # Keep only the last id; XttsGPT.forward overwrites it with the
+                # start audio generation token before calling this method.
+                input_ids = input_ids[-1].reshape(1, 1)
+            elif is_logits_only_mode:
+                # we remove the conditioning input and keep just the audio tokens
+                if isinstance(input_embeds, list):
+                    # Length of the conditioning prefix of each packed sequence
+                    starting_idx = []
+                    for input_embed in input_embeds:
+                        starting_idx.append(input_embed.shape[0])
+                    ending_ids = attn_metadata.seq_lens  # list
+                    # First sequence: from starting_idx[0] to ending_ids[0]
+                    cumulative_starts = [starting_idx[0]]  # First starts at its own index
+                    cumulative_ends = [ending_ids[0]]  # First ends at its ending_id
+                    # For subsequent sequences:
+                    # Start = previous_end + current_start
+                    # End = previous_end + current_end
+                    for i in range(1, len(starting_idx)):
+                        next_start = cumulative_ends[i - 1] + starting_idx[i]
+                        next_end = cumulative_ends[i - 1] + ending_ids[i]
+                        cumulative_starts.append(next_start)
+                        cumulative_ends.append(next_end)
+                    # Number of audio tokens per sequence, used below to split hidden_states
+                    ids_for_unpacking = [end-start for start, end in zip(cumulative_starts, cumulative_ends)]
+                    # Gather the audio-token slices of all sequences into one row
+                    input_ids = torch.cat([
+                        input_ids[start:end].reshape(1, -1)
+                        for start, end in zip(cumulative_starts, cumulative_ends)
+                    ], dim=-1)
+                    position_ids = torch.cat([
+                        position_ids[start:end].reshape(1, -1)
+                        for start, end in zip(cumulative_starts, cumulative_ends)
+                    ], dim= -1).squeeze(0)
+                else:
+                    # NOTE(review): this branch uses shape[1] as the conditioning
+                    # length while the list branch uses shape[0]; presumably the
+                    # single tensor carries a leading batch dimension - confirm.
+                    input_ids = input_ids[input_embeds.shape[1]:].reshape(1, -1)
+                    position_ids = position_ids[input_embeds.shape[1]:]#.reshape(1, -1)
+            else:
+                # Later iterations: the ids are used unchanged
+                input_ids = input_ids
+            audio_inputs_embeds = self.wte(input_ids).squeeze(0)
+            # weird but they do it like this in the xtts2 model
+            # First iteration: reshape(-1, 1) makes shape[1] == 1, so wpe.forward
+            # returns the embedding of a single position; otherwise the embeddings
+            # of the explicit position_ids are looked up.
+            position_embeds = self.wpe.get_fixed_embedding(
+                    position_ids, input_ids.device
+            ) if not is_first_iteration \
+                    else self.wpe(audio_inputs_embeds.reshape(-1, 1)) # we need to reshape to 2D tensor or useless?
+            hidden_states = audio_inputs_embeds + position_embeds
+            if isinstance(input_embeds, list) and is_logits_only_mode:
+                # Split the packed hidden states back into one chunk per sequence
+                hidden_states = list(hidden_states.split(ids_for_unpacking, dim=0))
+            if is_first_iteration or is_logits_only_mode:
+                # We concat the text and audio conditioning input in the sequence dimension
+                if isinstance(input_embeds, list):
+                    input_embeds = [input_embed.view(-1, input_embed.shape[-1]) for input_embed in input_embeds]
+                else:
+                    input_embeds = input_embeds.view(-1, input_embeds.shape[-1]) # we ensure we have a 2D tensor
+                if not isinstance(input_embeds, list) and input_embeds.shape[0] == attn_metadata.num_prefill_tokens:
+                    # this is during profiling, we need to remove the last token
+                    # the attn_metadata.num_prefill_tokens(prompt len) should be == to input_embeds.shape[0] - 1
+                    # to account for the start audio gen embedding that will be cat to the text embeddings
+                    input_embeds = input_embeds[:-1]
+            if is_first_iteration or is_logits_only_mode:
+                # we concatenate the conditioning input to the text conditioning input
+                if isinstance(input_embeds, list):
+                        # Interleave: conditioning of sequence i, then its hidden states.
+                        # If hidden_states is a single tensor it is reused for every sequence.
+                        hidden_states = torch.cat([
+                                tensor for pair in zip(input_embeds, [hidden_states] * len(input_embeds)
+                                                    if not isinstance(hidden_states, list) else hidden_states)
+                                for tensor in pair
+                            ], dim=0)
+                else:
+                    hidden_states = torch.cat([input_embeds, hidden_states], dim=0)
+            # flatten the hidden state
+            hidden_states = hidden_states.view(-1, self.embed_dim)
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+        # Run the layers owned by this rank; kv_caches is indexed from 0
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states,
+                                  kv_caches[i - self.start_layer],
+                                  attn_metadata)
+        # Ranks other than the last one hand the hidden states over without ln_f
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states