mlinmg committed on
Commit 7eebd5c
1 Parent(s): ad5c9de

Upload 8 files

config.json ADDED
@@ -0,0 +1,142 @@
+ {
+   "_name_or_path": "AstraMindAI/xtts2-gpt",
+   "architectures": [
+     "XttsGPT"
+   ],
+   "auto_map": {
+     "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+     "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT"
+   },
+   "audio_config": {
+     "fmax": 8000,
+     "fmin": 0,
+     "hop_length": 256,
+     "mel_channels": 80,
+     "mel_norms_file": null,
+     "n_fft": 1024,
+     "output_sample_rate": 24000,
+     "power": 1.0,
+     "sample_rate": 22050,
+     "win_length": 1024
+   },
+   "char_limits": {
+     "ar": 166,
+     "cs": 186,
+     "de": 253,
+     "en": 250,
+     "es": 239,
+     "fr": 273,
+     "hu": 224,
+     "it": 213,
+     "ja": 71,
+     "ko": 95,
+     "nl": 251,
+     "pl": 224,
+     "pt": 203,
+     "ru": 182,
+     "tr": 226,
+     "zh": 82
+   },
+   "duration_const": 102400,
+   "enable_redaction": false,
+   "gpt_batch_size": 1,
+   "gpt_checkpointing": false,
+   "gpt_code_stride_len": 1024,
+   "gpt_cond_chunk_len": 4,
+   "gpt_cond_len": 30,
+   "gpt_layers": 30,
+   "gpt_max_audio_tokens": 605,
+   "gpt_max_prompt_tokens": 70,
+   "gpt_max_text_tokens": 402,
+   "gpt_n_heads": 16,
+   "gpt_n_model_channels": 1024,
+   "gpt_num_audio_tokens": 1026,
+   "gpt_number_text_tokens": 6681,
+   "gpt_start_audio_token": 1024,
+   "gpt_start_text_token": null,
+   "gpt_stop_audio_token": 1025,
+   "gpt_stop_text_token": null,
+   "gpt_train_solo_embeddings": false,
+   "gpt_use_masking_gt_prompt_approach": true,
+   "gpt_use_perceiver_resampler": true,
+   "kv_cache": true,
+   "label_smoothing": 0.0,
+   "languages": [
+     "en",
+     "es",
+     "fr",
+     "de",
+     "it",
+     "pt",
+     "pl",
+     "tr",
+     "ru",
+     "nl",
+     "cs",
+     "ar",
+     "zh-cn",
+     "hu",
+     "ko",
+     "ja",
+     "hi"
+   ],
+   "max_ref_len": 30,
+   "model_type": "xtts_gpt",
+   "num_chars": 255,
+   "perceiver_cond_length_compression": 256,
+   "repetition_penalty": 5.0,
+   "sound_norm_refs": false,
+   "temperature": 0.75,
+   "top_p": 0.85,
+   "transformers_version": "4.45.1",
+   "vocab_size": 256,
+   "cond_d_vector_in_each_upsampling_layer": true,
+   "d_vector_dim": 512,
+   "decoder_input_dim": 1024,
+   "input_sample_rate": 22050,
+   "hifi_model_type": "xtts_hifigan",
+   "output_hop_length": 256,
+   "output_sample_rate": 24000,
+   "resblock_dilation_sizes": [
+     [
+       1,
+       3,
+       5
+     ],
+     [
+       1,
+       3,
+       5
+     ],
+     [
+       1,
+       3,
+       5
+     ]
+   ],
+   "resblock_kernel_sizes": [
+     3,
+     7,
+     11
+   ],
+   "speaker_encoder_config": {
+     "model_config": null,
+     "model_name": "speaker_encoder",
+     "preprocess_config": null,
+     "speaker_embedding_dim": 512,
+     "use_torch_spec": true
+   },
+   "upsample_initial_channel": 512,
+   "upsample_kernel_sizes": [
+     16,
+     16,
+     4,
+     4
+   ],
+   "upsample_rates": [
+     8,
+     8,
+     2,
+     2
+   ]
+ }
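
Note: the "auto_map" entries above point AutoConfig and AutoModelForCausalLM at the custom gpt_config.XTTSGPTConfig and xtts2_gpt_modeling.XttsGPT classes added in this commit. A minimal loading sketch, assuming the repo id from "_name_or_path" and that the remote code resolves cleanly through auto_map:

from transformers import AutoConfig

# trust_remote_code lets AutoConfig follow the "auto_map" entry above to
# gpt_config.XTTSGPTConfig instead of a built-in config class.
config = AutoConfig.from_pretrained("AstraMindAI/xtts2-gpt", trust_remote_code=True)

print(config.model_type)   # expected: "xtts_gpt"
print(config.gpt_layers)   # expected: 30, as in the JSON above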
gpt_config.py ADDED
@@ -0,0 +1,172 @@
+ from dataclasses import asdict, dataclass, field
+ from typing import Dict, Optional, List
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ @dataclass
+ class XTTSAudioConfig:
+     """Configuration for audio processing parameters"""
+     sample_rate: int = 22050
+     output_sample_rate: int = 24000
+     mel_channels: int = 80
+     hop_length: int = 256
+     win_length: int = 1024
+     n_fft: int = 1024
+     fmin: int = 0
+     fmax: int = 8000
+     power: float = 1.0
+     mel_norms_file: Optional[str] = None
+
+
+ class XTTSGPTConfig(PretrainedConfig):
+     """Configuration class for the GPT component of XTTS"""
+     model_type = "xtts_gpt"
+
+     def __init__(
+         self,
+         # Model architecture
+         vocab_size: int = 256,
+         num_chars: int = 255,
+
+         # GPT parameters
+         gpt_batch_size: int = 1,
+         gpt_max_audio_tokens: int = 605,
+         gpt_max_text_tokens: int = 402,
+         gpt_max_prompt_tokens: int = 70,
+         gpt_layers: int = 30,
+         gpt_n_model_channels: int = 1024,
+         gpt_n_heads: int = 16,
+         gpt_number_text_tokens: int = 6681,
+         gpt_start_text_token: Optional[int] = None,
+         gpt_stop_text_token: Optional[int] = None,
+         gpt_num_audio_tokens: int = 1026,
+         gpt_start_audio_token: int = 1024,
+         gpt_stop_audio_token: int = 1025,
+         gpt_code_stride_len: int = 1024,
+         gpt_use_masking_gt_prompt_approach: bool = True,
+         gpt_use_perceiver_resampler: bool = True,
+         gpt_checkpointing: bool = False,
+         gpt_train_solo_embeddings: bool = False,
+
+         # Training parameters
+         enable_redaction: bool = False,
+         kv_cache: bool = True,
+         perceiver_cond_length_compression: int = 256,
+         label_smoothing: float = 0.0,
+
+         # Generation parameters
+         temperature: float = 0.75,
+         length_penalty: float = 1.0,
+         repetition_penalty: float = 5.0,
+         top_k: int = 50,
+         top_p: float = 0.85,
+         gpt_cond_len: int = 30,
+         gpt_cond_chunk_len: int = 4,
+         max_ref_len: int = 30,
+         sound_norm_refs: bool = False,
+
+         # Audio processing
+         audio_config: Optional[XTTSAudioConfig] = None,
+
+         # Constants and limits
+         duration_const: int = 102400,
+         char_limits: Optional[Dict[str, int]] = None,
+         languages: Optional[List[str]] = None,
+         pad_token_id: Optional[int] = None,
+         bos_token_id: Optional[int] = None,
+         eos_token_id: Optional[int] = None,
+         **kwargs,
+     ):
+         if char_limits is None:
+             char_limits = {
+                 "en": 250, "de": 253, "fr": 273, "es": 239,
+                 "it": 213, "pt": 203, "pl": 224, "zh": 82,
+                 "ar": 166, "cs": 186, "ru": 182, "nl": 251,
+                 "tr": 226, "ja": 71, "hu": 224, "ko": 95,
+             }
+
+         if languages is None:
+             languages = [
+                 "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
+                 "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
+             ]
+
+         if audio_config is None:
+             audio_config = XTTSAudioConfig()
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs
+         )
+
+         self.vocab_size = vocab_size
+         self.num_chars = num_chars
+
+         # GPT parameters
+         self.gpt_batch_size = gpt_batch_size
+         self.gpt_max_audio_tokens = gpt_max_audio_tokens
+         self.gpt_max_text_tokens = gpt_max_text_tokens
+         self.gpt_max_prompt_tokens = gpt_max_prompt_tokens
+         self.gpt_layers = gpt_layers
+         self.gpt_n_model_channels = gpt_n_model_channels
+         self.gpt_n_heads = gpt_n_heads
+         self.gpt_number_text_tokens = gpt_number_text_tokens
+         self.gpt_start_text_token = gpt_start_text_token
+         self.gpt_stop_text_token = gpt_stop_text_token
+         self.gpt_num_audio_tokens = gpt_num_audio_tokens
+         self.gpt_start_audio_token = gpt_start_audio_token
+         self.gpt_stop_audio_token = gpt_stop_audio_token
+         self.gpt_code_stride_len = gpt_code_stride_len
+         self.gpt_use_masking_gt_prompt_approach = gpt_use_masking_gt_prompt_approach
+         self.gpt_use_perceiver_resampler = gpt_use_perceiver_resampler
+         self.gpt_checkpointing = gpt_checkpointing
+         self.gpt_train_solo_embeddings = gpt_train_solo_embeddings
+
+         # Training parameters
+         self.enable_redaction = enable_redaction
+         self.kv_cache = kv_cache
+         self.perceiver_cond_length_compression = perceiver_cond_length_compression
+         self.label_smoothing = label_smoothing
+
+         # Generation parameters
+         self.temperature = temperature
+         self.length_penalty = length_penalty
+         self.repetition_penalty = repetition_penalty
+         self.top_k = top_k
+         self.top_p = top_p
+         self.gpt_cond_len = gpt_cond_len
+         self.gpt_cond_chunk_len = gpt_cond_chunk_len
+         self.max_ref_len = max_ref_len
+         self.sound_norm_refs = sound_norm_refs
+
+         # Audio processing
+         self.audio_config = audio_config
+
+         # Constants and limits
+         self.duration_const = duration_const
+         self.char_limits = char_limits
+         self.languages = languages
+
+     def to_dict(self):
+         """Convert config to dictionary"""
+         config_dict = super().to_dict()
+         config_dict["audio_config"] = asdict(self.audio_config)
+         return config_dict
+
+     @classmethod
+     def from_dict(cls, config_dict):
+         """Create config from dictionary"""
+         audio_config = XTTSAudioConfig(**config_dict.pop("audio_config", {}))
+         return cls(audio_config=audio_config, **config_dict)
+
+     def update_with_tokenizer(self, tokenizer=None):
+         """Update configuration values based on tokenizer"""
+         if tokenizer is not None:
+             self.gpt_number_text_tokens = tokenizer.get_vocab_size()
+             self.gpt_start_text_token = tokenizer.bos_token_id
+             self.gpt_stop_text_token = tokenizer.eos_token_id
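
Note: the to_dict/from_dict overrides exist so the nested XTTSAudioConfig dataclass survives serialization. A minimal round-trip sketch, assuming this file is importable as gpt_config:

from gpt_config import XTTSAudioConfig, XTTSGPTConfig

# Build a config with two generation settings overridden.
config = XTTSGPTConfig(temperature=0.7, top_p=0.9)

# to_dict() serialises the nested dataclass as a plain dict;
# from_dict() rebuilds it, so audio settings survive a round trip.
as_dict = config.to_dict()
restored = XTTSGPTConfig.from_dict(as_dict)

assert isinstance(restored.audio_config, XTTSAudioConfig)
assert restored.audio_config.sample_rate == 22050
assert restored.temperature == 0.7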
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[START]",
+   "eos_token": "[STOP]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.py ADDED
@@ -0,0 +1,233 @@
+ from typing import List, Optional, Union, Dict, Tuple, Any
+ import os
+ from functools import cached_property
+
+ from transformers import PreTrainedTokenizerFast
+ from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
+ from tokenizers import Tokenizer, processors
+ from tokenizers.pre_tokenizers import WhitespaceSplit
+ from tokenizers.processors import TemplateProcessing
+ import torch
+ from hangul_romanize import Transliter
+ from hangul_romanize.rule import academic
+ import cutlet
+
+ from TTS.tts.layers.xtts.tokenizer import (multilingual_cleaners, basic_cleaners,
+                                            chinese_transliterate, korean_transliterate,
+                                            japanese_cleaners)
+
+ class XTTSTokenizerFast(PreTrainedTokenizerFast):
+     """
+     Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
+     """
+     def __init__(
+         self,
+         vocab_file: str = None,
+         tokenizer_object: Optional[Tokenizer] = None,
+         unk_token: str = "[UNK]",
+         pad_token: str = "[PAD]",
+         bos_token: str = "[START]",
+         eos_token: str = "[STOP]",
+         clean_up_tokenization_spaces: bool = True,
+         **kwargs
+     ):
+         if tokenizer_object is None and vocab_file is not None:
+             tokenizer_object = Tokenizer.from_file(vocab_file)
+
+         if tokenizer_object is not None:
+             # Configure the tokenizer
+             tokenizer_object.pre_tokenizer = WhitespaceSplit()
+             tokenizer_object.enable_padding(
+                 direction='right',
+                 pad_id=tokenizer_object.token_to_id(pad_token) or 0,
+                 pad_token=pad_token
+             )
+             tokenizer_object.post_processor = TemplateProcessing(
+                 single=f"{bos_token} $A {eos_token}",
+                 special_tokens=[
+                     (bos_token, tokenizer_object.token_to_id(bos_token)),
+                     (eos_token, tokenizer_object.token_to_id(eos_token)),
+                 ],
+             )
+
+         super().__init__(
+             tokenizer_object=tokenizer_object,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs
+         )
+
+         # Character limits per language
+         self.char_limits = {
+             "en": 250, "de": 253, "fr": 273, "es": 239,
+             "it": 213, "pt": 203, "pl": 224, "zh": 82,
+             "ar": 166, "cs": 186, "ru": 182, "nl": 251,
+             "tr": 226, "ja": 71, "hu": 224, "ko": 95,
+         }
+
+         # Initialize language tools
+         self._katsu = None
+         self._korean_transliter = Transliter(academic)
+
+     @cached_property
+     def katsu(self):
+         if self._katsu is None:
+             self._katsu = cutlet.Cutlet()
+         return self._katsu
+
+     def check_input_length(self, text: str, lang: str):
+         """Check if input text length is within limits for language"""
+         lang = lang.split("-")[0]  # remove region
+         limit = self.char_limits.get(lang, 250)
+         if len(text) > limit:
+             print(f"Warning: Text length exceeds {limit} char limit for '{lang}', may cause truncation.")
+
+     def preprocess_text(self, text: str, lang: str) -> str:
+         """Apply text preprocessing for language"""
+         if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it",
+                     "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+             text = multilingual_cleaners(text, lang)
+             if lang == "zh":
+                 text = chinese_transliterate(text)
+             if lang == "ko":
+                 text = korean_transliterate(text)
+         elif lang == "ja":
+             text = japanese_cleaners(text, self.katsu)
+         else:
+             text = basic_cleaners(text)
+         return text
+
+     def _batch_encode_plus(
+         self,
+         batch_text_or_text_pairs,
+         add_special_tokens: bool = True,
+         padding_strategy = PaddingStrategy.DO_NOT_PAD,
+         truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE,
+         max_length: Optional[int] = 402,
+         stride: int = 0,
+         is_split_into_words: bool = False,
+         pad_to_multiple_of: Optional[int] = None,
+         return_tensors: Optional[str] = None,
+         return_token_type_ids: Optional[bool] = None,
+         return_attention_mask: Optional[bool] = None,
+         return_overflowing_tokens: bool = False,
+         return_special_tokens_mask: bool = False,
+         return_offsets_mapping: bool = False,
+         return_length: bool = False,
+         verbose: bool = True,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Override batch encoding to handle language-specific preprocessing
+         """
+         lang = kwargs.pop("lang", ["en"] * len(batch_text_or_text_pairs))
+         if isinstance(lang, str):
+             lang = [lang] * len(batch_text_or_text_pairs)
+
+         # Preprocess each text in the batch with its corresponding language
+         processed_texts = []
+         for text, text_lang in zip(batch_text_or_text_pairs, lang):
+             if isinstance(text, str):
+                 # Check length and preprocess
+                 self.check_input_length(text, text_lang)
+                 processed_text = self.preprocess_text(text, text_lang)
+
+                 # Format text with language tag and spaces
+                 lang_code = "zh-cn" if text_lang == "zh" else text_lang
+                 processed_text = f"[{lang_code}]{processed_text}"
+                 processed_text = processed_text.replace(" ", "[SPACE]")
+
+                 processed_texts.append(processed_text)
+             else:
+                 processed_texts.append(text)
+
+         # Call the parent class's encoding method with processed texts
+         return super()._batch_encode_plus(
+             processed_texts,
+             add_special_tokens=add_special_tokens,
+             padding_strategy=padding_strategy,
+             truncation_strategy=truncation_strategy,
+             max_length=max_length,
+             stride=stride,
+             is_split_into_words=is_split_into_words,
+             pad_to_multiple_of=pad_to_multiple_of,
+             return_tensors=return_tensors,
+             return_token_type_ids=return_token_type_ids,
+             return_attention_mask=return_attention_mask,
+             return_overflowing_tokens=return_overflowing_tokens,
+             return_special_tokens_mask=return_special_tokens_mask,
+             return_offsets_mapping=return_offsets_mapping,
+             return_length=return_length,
+             verbose=verbose,
+             **kwargs
+         )
+
+     def __call__(
+         self,
+         text: Union[str, List[str]],
+         lang: Union[str, List[str]] = "en",
+         add_special_tokens: bool = True,
+         padding: Union[bool, str, PaddingStrategy] = True,  # Changed default to True
+         truncation: Union[bool, str, TruncationStrategy] = True,  # Changed default to True
+         max_length: Optional[int] = 402,
+         stride: int = 0,
+         return_tensors: Optional[str] = None,
+         return_token_type_ids: Optional[bool] = None,
+         return_attention_mask: Optional[bool] = True,  # Changed default to True
+         **kwargs
+     ):
+         """
+         Main tokenization method
+         Args:
+             text: Text or list of texts to tokenize
+             lang: Language code or list of language codes corresponding to each text
+             add_special_tokens: Whether to add special tokens
+             padding: Padding strategy (default True)
+             truncation: Truncation strategy (default True)
+             max_length: Maximum length
+             stride: Stride for truncation
+             return_tensors: Format of output tensors ("pt" for PyTorch)
+             return_token_type_ids: Whether to return token type IDs
+             return_attention_mask: Whether to return attention mask (default True)
+         """
+         # Convert single string to list for batch processing
+         if isinstance(text, str):
+             text = [text]
+         if isinstance(lang, str):
+             lang = [lang]
+
+         # Ensure text and lang lists have same length
+         if len(text) != len(lang):
+             raise ValueError(f"Number of texts ({len(text)}) must match number of language codes ({len(lang)})")
+
+         # Convert padding strategy
+         if isinstance(padding, bool):
+             padding_strategy = PaddingStrategy.MAX_LENGTH if padding else PaddingStrategy.DO_NOT_PAD
+         else:
+             padding_strategy = PaddingStrategy(padding)
+
+         # Convert truncation strategy
+         if isinstance(truncation, bool):
+             truncation_strategy = TruncationStrategy.LONGEST_FIRST if truncation else TruncationStrategy.DO_NOT_TRUNCATE
+         else:
+             truncation_strategy = TruncationStrategy(truncation)
+
+         # Use the batch encoding method
+         encoded = self._batch_encode_plus(
+             text,
+             add_special_tokens=add_special_tokens,
+             padding_strategy=padding_strategy,
+             truncation_strategy=truncation_strategy,
+             max_length=max_length,
+             stride=stride,
+             return_tensors=return_tensors,
+             return_token_type_ids=return_token_type_ids,
+             return_attention_mask=return_attention_mask,
+             lang=lang,
+             **kwargs
+         )
+
+         return encoded
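
Note: a hedged usage sketch, assuming this module is importable as tokenizer, the tokenizer.json from this commit is available locally, and the TTS, cutlet, and hangul_romanize dependencies are installed:

from tokenizer import XTTSTokenizerFast

tok = XTTSTokenizerFast(vocab_file="tokenizer.json")

batch = tok(
    ["Hello world.", "Bonjour tout le monde."],
    lang=["en", "fr"],        # one language code per text
    return_tensors="pt",      # padded ids plus attention mask as tensors
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)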
tokenizer_config.json ADDED
@@ -0,0 +1,191 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[STOP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SPACE]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "259": {
+       "content": "[en]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "260": {
+       "content": "[de]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "261": {
+       "content": "[START]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "262": {
+       "content": "[fr]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "267": {
+       "content": "[ru]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "284": {
+       "content": "[es]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "285": {
+       "content": "[it]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "286": {
+       "content": "[pt]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "293": {
+       "content": "[cs]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "294": {
+       "content": "[pl]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "295": {
+       "content": "[tr]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "297": {
+       "content": "[nl]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5022": {
+       "content": "[ar]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5023": {
+       "content": "[zh-cn]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5412": {
+       "content": "[ja]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5753": {
+       "content": "[hu]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6152": {
+       "content": "[ko]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6680": {
+       "content": "[hi]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6681": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[START]",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "[STOP]",
+   "max_length": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "tokenizer_class": "XTTSTokenizer",
+   "unk_token": "[UNK]"
+ }
xtts2_gpt_modeling.py ADDED
@@ -0,0 +1,312 @@
+ import functools
+ import math
+ from array import array
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from typing import List, Optional, Union, Iterable, Tuple, Mapping
+
+ from transformers import PretrainedConfig
+ from vllm.attention import AttentionMetadata
+ from vllm.config import CacheConfig
+ from vllm.distributed import get_pp_group
+ from vllm.inputs import InputContext, INPUT_REGISTRY
+ from vllm.model_executor.layers.linear import ColumnParallelLinear
+ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+ from vllm.model_executor.models.gpt2 import GPT2Block
+ from vllm.model_executor.sampling_metadata import SamplingMetadata
+ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+ from vllm.sequence import IntermediateTensors, SequenceData, VLLM_TOKEN_ID_ARRAY_TYPE
+ from vllm.model_executor.models.interfaces import SupportsMultiModal
+
+ from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder  # noqa
+ from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler  # noqa
+
+ from TTS.tts.layers.xtts.gpt import LearnedPositionEmbeddings
+
+ # Constants for token calculation
+ _AUDIO_PLACEHOLDER_TOKEN = 8192  # Using XTTS start_audio_token as placeholder
+ _AUDIO_TOKENS_PER_SECOND = 6.25
+ _CODE_STRIDE_LEN = 1024
+
+
+ def get_xtts_max_audio_tokens(ctx: InputContext) -> int:
+     """Calculate maximum audio tokens based on text context and audio duration."""
+     # Based on GPT config and common XTTS settings
+     text_context = ctx.model_config.max_seq_len - 100  # Reserve space for text
+     # Allow for ~30 seconds of audio (similar to whisper chunks)
+     max_audio_duration = 30.0
+     audio_tokens = math.ceil(max_audio_duration * _AUDIO_TOKENS_PER_SECOND)
+     total_tokens = text_context + audio_tokens + 4  # +4 for special tokens
+
+     return min(total_tokens, 1000)  # Cap at 1000 tokens as specified
+
+
+ def dummy_seq_data_for_xtts(
+     ctx: InputContext,
+     seq_len: int,
+     audio_count: int,
+ ) -> SequenceData:
+     """Create dummy sequence data for XTTS profiling."""
+     # Calculate audio token space needed
+     audio_len_tokens = math.ceil(_AUDIO_TOKENS_PER_SECOND * 5)  # Assume 5s per chunk
+     audio_placeholder = array(
+         VLLM_TOKEN_ID_ARRAY_TYPE,
+         [_AUDIO_PLACEHOLDER_TOKEN]
+     ) * audio_len_tokens
+
+     # Add separator between chunks
+     audio_token_ids = (audio_placeholder + array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count
+
+     # Fill remaining sequence with padding
+     other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - len(audio_token_ids))
+
+     return SequenceData(audio_token_ids + other_token_ids)
+
+
+ def dummy_conditioning_for_xtts(
+     ctx: InputContext,
+     audio_count: int,
+ ) -> dict:
+     """Create dummy conditioning data for XTTS."""
+     return {
+         "cond_latents": [(torch.zeros(80, 1024), 22050) for _ in range(audio_count)]
+     }
+
+
+ def dummy_data_for_xtts(
+     ctx: InputContext,
+     seq_len: int,
+     mm_counts: Mapping[str, int],
+ ) -> Tuple[SequenceData, dict]:
+     """Create complete dummy data for XTTS profiling."""
+     audio_count = mm_counts["audio"]
+     seq_data = dummy_seq_data_for_xtts(ctx, seq_len, audio_count)
+     cond_data = dummy_conditioning_for_xtts(ctx, audio_count)
+     return (seq_data, cond_data)
+
+
+ def input_mapper_for_xtts(ctx: InputContext, data: object) -> MultiModalInputs:
+     """Map input data to XTTS format."""
+     if not isinstance(data, list):
+         data = [data]
+
+     # Each item should be a tuple of (mel_spec, sample_rate)
+     for audio_input in data:
+         if not isinstance(audio_input, tuple):
+             raise NotImplementedError(f"Unsupported data type: {type(audio_input)}")
+
+     return MultiModalInputs({"cond_latents": data})
+
+
+ @MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_xtts)
+ @MULTIMODAL_REGISTRY.register_max_multimodal_tokens("audio", get_xtts_max_audio_tokens)
+ @INPUT_REGISTRY.register_dummy_data(dummy_data_for_xtts)
+ class XttsGPT(nn.Module, SupportsMultiModal):
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         cache_config: Optional[CacheConfig] = None,
+         quant_config: Optional["QuantizationConfig"] = None,
+     ):
+         super().__init__()
+         self.config = config
+         self.quant_config = quant_config
+
+         # XTTS specific components
+         self.conditioning_encoder = ConditioningEncoder(
+             80, config.n_embd, num_attn_heads=config.n_head
+         )
+
+         if config.use_perceiver_resampler:
+             self.conditioning_perceiver = PerceiverResampler(
+                 dim=config.n_embd,
+                 depth=2,
+                 dim_context=config.n_embd,
+                 num_latents=32,
+                 dim_head=64,
+                 heads=8,
+                 ff_mult=4,
+                 use_flash_attn=False,
+             )
+
+         # Core GPT components following VLLM pattern
+         self.gpt = XttsGPT2Model(
+             config,
+             cache_config,
+             quant_config,
+             prefix="gpt"
+         )
+
+         # Prediction heads
+         self.text_head = ColumnParallelLinear(
+             config.n_embd,
+             config.vocab_size,
+             bias=False,
+             quant_config=quant_config,
+             prefix="text_head"
+         )
+
+         self.mel_head = ColumnParallelLinear(
+             config.n_embd,
+             config.num_audio_tokens,
+             bias=False,
+             quant_config=quant_config,
+             prefix="mel_head"
+         )
+
+         self.sampler = Sampler()
+
+     def get_style_emb(self, cond_input: torch.Tensor, return_latent: bool = False) -> torch.Tensor:
+         """Get conditioning embeddings from mel spectrograms."""
+         if not return_latent:
+             if cond_input.ndim == 4:
+                 cond_input = cond_input.squeeze(1)
+             conds = self.conditioning_encoder(cond_input)
+
+             if hasattr(self, 'conditioning_perceiver'):
+                 conds = self.conditioning_perceiver(
+                     conds.permute(0, 2, 1)
+                 ).transpose(1, 2)
+         else:
+             conds = cond_input.unsqueeze(1)
+         return conds
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         kv_caches: List[torch.Tensor],
+         attn_metadata: AttentionMetadata,
+         intermediate_tensors: Optional[IntermediateTensors] = None,
+         cond_latents: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         """Forward pass following VLLM pattern."""
+         if cond_latents is not None:
+             # Combine conditioning with input embeddings
+             input_embeds = self.gpt.get_input_embeddings()(input_ids)
+             combined_embeds = torch.cat([cond_latents, input_embeds], dim=1)
+             hidden_states = self.gpt(
+                 inputs_embeds=combined_embeds,
+                 positions=positions,
+                 kv_caches=kv_caches,
+                 attn_metadata=attn_metadata,
+                 intermediate_tensors=intermediate_tensors,
+             )
+         else:
+             hidden_states = self.gpt(
+                 input_ids=input_ids,
+                 positions=positions,
+                 kv_caches=kv_caches,
+                 attn_metadata=attn_metadata,
+                 intermediate_tensors=intermediate_tensors,
+             )
+         return hidden_states
+
+     def compute_logits(  # useless but kept for compatibility
+         self,
+         hidden_states: torch.Tensor,
+         sampling_metadata: SamplingMetadata,
+     ) -> torch.Tensor:
+         """Compute output logits."""
+         text_logits = self.text_head(hidden_states[sampling_metadata.selected_token_indices])
+         mel_logits = self.mel_head(hidden_states[sampling_metadata.selected_token_indices])
+         return torch.cat([text_logits, mel_logits], dim=1)
+
+     def sample(
+         self,
+         logits: torch.Tensor,
+         sampling_metadata: SamplingMetadata,
+     ) -> Optional[SamplerOutput]:
+         """Sample next tokens using VLLM sampler."""
+         return self.sampler(logits, sampling_metadata)
+
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         """Load weights following VLLM pattern."""
+         params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+         for name, loaded_weight in weights:
+             if name not in params_dict:
+                 continue
+
+             param = params_dict[name]
+             if "c_attn" in name or "c_proj" in name or "c_fc" in name:
+                 if name.endswith(".weight"):
+                     loaded_weight = loaded_weight.t()
+
+             weight_loader = getattr(param, "weight_loader", default_weight_loader)
+             weight_loader(param, loaded_weight)
+
+
+ class XttsGPT2Model(nn.Module):
+     """VLLM-style implementation of GPT2 core architecture."""
+
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         cache_config: Optional[CacheConfig] = None,
+         quant_config: Optional["QuantizationConfig"] = None,
+         prefix: str = "",
+     ):
+         super().__init__()
+         self.config = config
+         self.text_embedding = VocabParallelEmbedding(config.number_text_tokens, config.n_embd)
+         self.mel_embedding = VocabParallelEmbedding(config.num_audio_tokens, config.n_embd)
+
+         self.text_pos_embedding = (
+             LearnedPositionEmbeddings(config.max_text_seq_len, config.n_embd)
+             if config.max_mel_seq_len != -1
+             else functools.partial(config.null_position_embeddings, dim=config.n_embd)
+         )
+         self.mel_pos_embedding = (
+             LearnedPositionEmbeddings(config.max_mel_seq_len, config.n_embd)
+             if config.max_mel_seq_len != -1
+             else functools.partial(config.null_position_embeddings, dim=config.n_embd)
+         )
+         # Build gpt blocks
+         self.h = nn.ModuleList([
+             GPT2Block(
+                 config,
+                 cache_config,
+                 quant_config,
+                 prefix=f"{prefix}.h.{i}"
+             ) for i in range(config.num_hidden_layers)
+         ])
+
+         self.final_norm = nn.LayerNorm(
+             config.n_embd,
+             eps=config.layer_norm_epsilon
+         )
+
+     def forward(  # TODO: this is not correct, replace it with the correct implementation
+         self,
+         input_ids: torch.Tensor,
+         position_ids: torch.Tensor,
+         kv_caches: List[torch.Tensor],
+         attn_metadata: AttentionMetadata,
+         intermediate_tensors: Optional[IntermediateTensors],
+     ) -> Union[torch.Tensor, IntermediateTensors]:
+         if get_pp_group().is_first_rank:
+             inputs_embeds = self.wte(input_ids)
+             position_embeds = self.wpe(position_ids)
+             hidden_states = inputs_embeds + position_embeds
+         else:
+             assert intermediate_tensors is not None
+             hidden_states = intermediate_tensors["hidden_states"]
+
+         for i in range(self.start_layer, self.end_layer):
+             layer = self.h[i]
+             hidden_states = layer(hidden_states,
+                                   kv_caches[i - self.start_layer],
+                                   attn_metadata)
+
+         if not get_pp_group().is_last_rank:
+             return IntermediateTensors({"hidden_states": hidden_states})
+
+         hidden_states = self.ln_f(hidden_states)
+         return hidden_states
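
Note: the token budget computed in get_xtts_max_audio_tokens above can be checked by hand. An illustrative sketch, assuming a hypothetical max_seq_len of 512 (the sequence length itself is not part of this commit):

import math

# 30 s of audio at 6.25 tokens/s, as in get_xtts_max_audio_tokens above.
audio_tokens = math.ceil(30.0 * 6.25)               # 188
total = min((512 - 100) + audio_tokens + 4, 1000)   # 604 for max_seq_len = 512
print(audio_tokens, total)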
xttsv2-gpt.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93fa43aaad29e232fa6c85f3d6c3285285c1fe4c89f9505d8153e231b12e1a50
+ size 1764117740