{ "acoustic_text_encoder_config": { "_attn_implementation_autoset": true, "model_type": "acoustic_text_encoder" }, "architectures": [ "StyleTextToSpeech2ForConditionalGeneration" ], "decoder_config": { "_attn_implementation_autoset": true, "model_type": "decoder" }, "duration_encoder_config": { "_attn_implementation_autoset": true, "model_type": "duration_encoder" }, "duration_predictor_config": { "_attn_implementation_autoset": true, "model_type": "duration_predictor" }, "initializer_range": 0.02, "model_type": "style_text_to_speech_2", "prosodic_text_encoder_config": { "_attn_implementation_autoset": true, "bert_config": { "dropout": 0.1, "hidden_size": 768, "intermediate_size": 2048, "num_attention_heads": 12, "vocab_size": 178 }, "model_type": "prosodic_encoder" }, "prosody_predictor_config": { "_attn_implementation_autoset": true, "model_type": "prosody_predictor" }, "torch_dtype": "float32", "transformers_version": "4.49.0.dev0" }