{ "_name_or_path": "AstraMindAI/xtts2-gpt", "architectures": [ "XttsGPT" ], "auto_map": { "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig", "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT", "AutoTokenizer": "AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast" }, "audio_config": { "fmax": 8000, "fmin": 0, "hop_length": 256, "mel_channels": 80, "mel_norms_file": null, "n_fft": 1024, "output_sample_rate": 24000, "power": 1.0, "sample_rate": 22050, "win_length": 1024 }, "char_limits": { "ar": 166, "cs": 186, "de": 253, "en": 250, "es": 239, "fr": 273, "hu": 224, "it": 213, "ja": 71, "ko": 95, "nl": 251, "pl": 224, "pt": 203, "ru": 182, "tr": 226, "zh": 82 }, "duration_const": 102400, "enable_redaction": false, "gpt_batch_size": 1, "gpt_checkpointing": false, "gpt_code_stride_len": 1024, "gpt_cond_chunk_len": 4, "gpt_cond_len": 30, "gpt_layers": 30, "gpt_max_audio_tokens": 605, "gpt_max_prompt_tokens": 70, "gpt_max_text_tokens": 402, "gpt_n_heads": 16, "gpt_n_model_channels": 1024, "gpt_num_audio_tokens": 1026, "gpt_number_text_tokens": 6681, "gpt_start_audio_token": 1024, "gpt_start_text_token": null, "gpt_stop_audio_token": 1025, "gpt_stop_text_token": null, "gpt_train_solo_embeddings": false, "gpt_use_masking_gt_prompt_approach": true, "gpt_use_perceiver_resampler": true, "kv_cache": true, "label_smoothing": 0.0, "languages": [ "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi" ], "max_ref_len": 30, "model_type": "xtts_gpt", "num_chars": 255, "perceiver_cond_length_compression": 256, "repetition_penalty": 5.0, "sound_norm_refs": false, "temperature": 0.75, "top_p": 0.85, "transformers_version": "4.45.1", "vocab_size": 256, "cond_d_vector_in_each_upsampling_layer": true, "d_vector_dim": 512, "decoder_input_dim": 1024, "input_sample_rate": 22050, "hifi_model_type": "xtts_hifigan", "output_hop_length": 256, "output_sample_rate": 24000, "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "resblock_kernel_sizes": [ 3, 7, 11 ], "speaker_encoder_config": { "model_config": null, "model_name": "speaker_encoder", "preprocess_config": null, "speaker_embedding_dim": 512, "use_torch_spec": true }, "upsample_initial_channel": 512, "upsample_kernel_sizes": [ 16, 16, 4, 4 ], "upsample_rates": [ 8, 8, 2, 2 ] }