# Hailio-TTS-API / app.py
# daswer123's picture
# Upload 2 files
# cf9596a verified
import random
import string
import gradio as gr
from hailuo_tts import HailuoTTS
import os
# Global variable to store TTS instance
# It is module-level state: authorize() rebinds it, and text_to_speech() and
# clone_voice() read it. It stays None until an authorization succeeds.
# NOTE(review): being module-level, the instance is shared by every caller in
# this process, so with several browser sessions the most recent successful
# authorization presumably applies to all of them -- confirm whether
# per-session state is needed.
tts_instance = None
def authorize(api_key, group_id):
    """Create the shared TTS instance from the given credentials.

    Args:
        api_key: API key entered by the user; surrounding whitespace is ignored.
        group_id: Group ID entered by the user; surrounding whitespace is ignored.

    Returns:
        A pair of ``gr.update`` objects for the event's two outputs: the first
        controls the visibility of the main tabs, the second the status
        textbox. On success the tabs are shown and the status box is hidden;
        on failure the tabs are hidden and the status box shows the error.
    """
    global tts_instance

    # Credentials are usually pasted, so drop stray whitespace and refuse
    # blank values up front instead of handing them to the client factory.
    api_key = (api_key or "").strip()
    group_id = (group_id or "").strip()
    if not api_key or not group_id:
        return (
            gr.update(visible=False),
            gr.update(visible=True, value="Authorization error: API key and Group ID are required"),
        )

    try:
        # NOTE(review): whether create() actually verifies the credentials
        # against the service is not visible from this file.
        tts_instance = HailuoTTS.create(api_key=api_key, group_id=group_id)
    except Exception as e:
        # UI boundary: report the failure in the status box rather than raising.
        return gr.update(visible=False), gr.update(visible=True, value=f"Authorization error: {str(e)}")
    return gr.update(visible=True), gr.update(visible=False)
def on_model_change(model):
    """Return a visibility update that is visible only for the "turbo" model.

    The event wiring in this file applies the update to the emotion dropdown.
    """
    is_turbo = (model == "turbo")
    return gr.update(visible=is_turbo)
def text_to_speech(text, model, voice, speed, volume, pitch, emotion, language,
                   sample_rate, bitrate, audio_format, channel):
    """Generate speech for ``text`` using the shared TTS instance.

    The model, voice, voice parameters, optional emotion / language boost and
    audio settings are pushed onto the instance before generation.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        model: Model name; the emotion is only applied when this is "turbo".
        voice: Voice ID passed to ``set_voice``.
        speed, volume, pitch: Voice parameters (converted to float/float/int).
        emotion: Emotion name, used only with the "turbo" model.
        language: Language boost value; "auto" means no boost is set.
        sample_rate, bitrate, channel: Audio settings (converted to int).
        audio_format: Audio format; also used as the output file extension.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when validation or generation fails.
    """
    import tempfile

    global tts_instance
    try:
        if not text or not text.strip():
            return None, "Error: Text is empty"
        if tts_instance is None:
            return None, "Error: Not authorized. Please authorize first."

        # Update settings
        tts_instance.set_model(model)
        tts_instance.set_voice(voice)
        tts_instance.set_voice_params(speed=float(speed), volume=float(volume), pitch=int(pitch))
        # NOTE(review): when the model is not "turbo" or the language is
        # "auto", no setter is called, so a value set by an earlier request
        # presumably stays on the shared instance -- confirm against HailuoTTS.
        if model == "turbo" and emotion:
            tts_instance.set_emotion(emotion)
        if language != "auto":
            tts_instance.set_language_boost(language)

        # Update audio settings
        tts_instance.update_audio_settings(
            sample_rate=int(sample_rate),
            bitrate=int(bitrate),
            format=audio_format,
            channel=int(channel)
        )

        # Generate speech into a per-request temp directory so overlapping
        # requests cannot overwrite each other's "output.<format>" file.
        output_dir = tempfile.mkdtemp(prefix="hailuo_tts_")
        output_path = os.path.join(output_dir, f"output.{audio_format}")
        tts_instance.text_to_speech(text, output_path)
        return output_path, "Audio generated successfully!"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        return None, f"Error: {str(e)}"
def generate_random_voice_id():
    """Return "random_" followed by 12 randomly chosen ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    suffix_chars = random.choices(alphabet, k=12)
    return "random_" + ''.join(suffix_chars)
def show_voice_id_input(use_custom_voice_id):
    """Return a visibility update that is the inverse of the checkbox value.

    The event wiring in this file feeds this from the "Random Voice ID"
    checkbox and applies the result to the Voice ID textbox, so the textbox is
    shown only while the checkbox is unchecked.
    """
    textbox_visible = not use_custom_voice_id
    return gr.update(visible=textbox_visible)
def clone_voice(audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize,use_custom_voice_id):
    """Upload an audio sample and clone a voice from it.

    Args:
        audio_file: Uploaded file; either an object with a ``.name`` path
            attribute or a plain path string.
        voice_id: Voice ID typed by the user; used only when
            ``use_custom_voice_id`` is falsy.
        noise_reduction: Passed through to the clone request.
        preview_text: Text passed through as the preview text.
        accuracy: Passed through after conversion to float.
        volume_normalize: Passed through to the clone request.
        use_custom_voice_id: Value of the "Random Voice ID" checkbox. Despite
            the parameter name, a truthy value means a random ID is generated
            and ``voice_id`` is ignored.

    Returns:
        ``(demo_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when validation or cloning fails.
    """
    global tts_instance

    # Validate user input first so the status box shows an actionable message
    # instead of a raw AttributeError string.
    if audio_file is None:
        return None, "Error: No audio file provided"
    if not use_custom_voice_id:
        voice_id = (voice_id or "").strip()
        if not voice_id:
            return None, "Error: Voice ID is required when Random Voice ID is unchecked"

    try:
        if tts_instance is None:
            return None, "Error: Not authorized. Please authorize first."

        # Upload file. Accept both file-like objects exposing `.name` and
        # plain path strings.
        audio_path = getattr(audio_file, "name", audio_file)
        file_id = tts_instance.upload_voice_file(audio_path)

        voice_id = voice_id if not use_custom_voice_id else generate_random_voice_id()
        print(voice_id)

        # Clone voice; only the demo path from the result is used here.
        _response, demo_path = tts_instance.clone_voice(
            file_id=file_id,
            voice_id=voice_id,
            noise_reduction=noise_reduction,
            preview_text=preview_text,
            accuracy=float(accuracy),
            volume_normalize=volume_normalize
        )
        return demo_path, f"Voice cloned successfully! Voice ID: {voice_id}"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        return None, f"Error: {str(e)}"
# Create interface
# Layout: an authorization accordion (credentials + status box) followed by
# the main tabs, which start hidden and are revealed by authorize().
# Event handlers are registered at the end, inside the Blocks context.
with gr.Blocks() as app:
    # Authorization screen
    with gr.Accordion("Authorization", open=True):
        gr.Markdown("""
# Hailio TTS - Text-to-Speech Service
## Important Links
1. List of supported languages: https://www.hailuo.ai/audio
2. Get your API credentials:
- Group ID and API Key can be found at:
- https://intl.minimaxi.com/user-center/basic-information
- https://intl.minimaxi.com/user-center/basic-information/interface-key
## Pricing
- Turbo Model: $50 per 1M characters
- HD Model: $30 per 1M characters
- Voice Cloning:
- Verified voice clone: $3 per voice
- Unverified voice clone: Free
""")
        with gr.Row(visible=True) as auth_row:
            with gr.Column():
                api_key = gr.Textbox(label="API Key",type="password", placeholder="Enter your API key")
                group_id = gr.Textbox(label="Group ID",type="password", placeholder="Enter your Group ID")
                auth_btn = gr.Button("Authorize")
                auth_error = gr.Textbox(label="Status", interactive=False)
    # Main interface (initially hidden)
    with gr.Tabs(visible=False) as tabs:
        # TTS tab
        with gr.Tab("Text to Speech"):
            with gr.Row():
                with gr.Column():
                    # Main parameters
                    text_input = gr.Textbox(label="Text", placeholder="Enter text for speech", lines=5)
                    model = gr.Dropdown(choices=["turbo", "hd"], value="hd",info="Emotions work only with turbo model", label="Model")
                    voice = gr.Dropdown(choices=HailuoTTS.VOICES, allow_custom_value=True, value="Friendly_Person", label="VoiceId", info="You can set a custom value here, for example you can specify the voice ID that you cloned in another tab, but keep in mind the note written in clone voice")
                    with gr.Row():
                        speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed")
                        volume = gr.Slider(minimum=0, maximum=10, value=1.0, label="Volume")
                        pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch")
                    # Additional parameters
                    # The emotion dropdown starts hidden because the default
                    # model is "hd"; on_model_change() toggles it.
                    emotion = gr.Dropdown(choices=HailuoTTS.EMOTIONS, label="Emotion", visible=False)
                    language = gr.Dropdown(choices=HailuoTTS.SUPPORTED_LANGUAGES, value="auto", label="Language Boost",info="Language Boost increases the accuracy of the voice, but only work with supported languages")
                    # Audio settings in accordion
                    with gr.Accordion("Audio Settings", open=True):
                        with gr.Row():
                            sample_rate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"][-1],
                                label="Sample Rate"
                            )
                            bitrate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"][-1],
                                label="Bitrate"
                            )
                        with gr.Row():
                            audio_format = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["format"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["format"][0],
                                label="Format"
                            )
                            channel = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["channel"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["channel"][0],
                                label="Channels"
                            )
                # Generation button and output
                with gr.Column():
                    tts_output = gr.Audio(label="Result")
                    tts_status = gr.Textbox(label="Status", interactive=False)
                    tts_btn = gr.Button("Generate")
        # Clone Voice tab
        with gr.Tab("Clone Voice"):
            gr.Markdown("""
### File Requirements:
- Formats: MP3, M4A, WAV
- Duration: 10s to 5min
- Size: Less than 20MB
- Quality: Clear voice recording with minimal background noise
- Content: Natural speech in any language
""")
            with gr.Row():
                with gr.Column():
                    # Cloning parameters
                    audio_file = gr.File(label="Audio File", file_types=["audio"])
                    # Checked (the default) means a random ID is generated and
                    # the Voice ID textbox stays hidden; unchecking reveals the
                    # textbox. The variable name reads the other way round but
                    # is kept because the handlers below refer to it.
                    use_custom_voice_id = gr.Checkbox(label="Random Voice ID",value=True,info="While checked, a random voice ID is generated. Uncheck this box to enter a custom voice ID")
                    voice_id = gr.Textbox(label="Voice ID",visible=False, placeholder="Minimum 8 characters, letters and numbers,first letter must be a letter")
                    with gr.Row():
                        noise_reduction = gr.Checkbox(label="Noise Reduction", value=False)
                        volume_normalize = gr.Checkbox(label="Volume Normalization", value=False)
                    preview_text = gr.Textbox(label="Preview Text (max 300 characters)",max_length=300, value="Test voice", lines=2)
                    accuracy = gr.Slider(minimum=0, maximum=1, value=0.7, label="Accuracy")
                with gr.Column():
                    clone_output = gr.Audio(label="Preview")
                    clone_status = gr.Textbox(label="Status", interactive=False)
                    clone_btn = gr.Button("Clone")
            gr.Markdown("""
# Important Notes:
1. When you get a voice preview, it is synthesized using the turbo model.
2. You don't pay $3 for voice cloning. You only pay for synthesis.
3. You can copy the resulting ID and use it in the TTS tab. Please note that as soon as you use it at least once, you will be charged $3 for voice creation. It will be linked to your account. Make sure to save this ID somewhere to use it in TTS later.
4. Unverified voice cloning is free, but it life time is limited to 7 days.
""")
    # Event handlers
    # authorize() returns (tabs update, status-box update), in that order.
    auth_btn.click(
        authorize,
        inputs=[api_key, group_id],
        outputs=[tabs, auth_error]
    )
    model.change(
        on_model_change,
        inputs=[model],
        outputs=[emotion]
    )
    # Input order must match the positional parameters of text_to_speech().
    tts_btn.click(
        text_to_speech,
        inputs=[
            text_input, model, voice, speed, volume, pitch, emotion, language,
            sample_rate, bitrate, audio_format, channel
        ],
        outputs=[tts_output, tts_status]
    )
    # Input order must match the positional parameters of clone_voice().
    clone_btn.click(
        clone_voice,
        inputs=[audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize,use_custom_voice_id],
        outputs=[clone_output, clone_status]
    )
    use_custom_voice_id.change(
        show_voice_id_input,
        inputs=[use_custom_voice_id],
        outputs=[voice_id]
    )
# Launch interface
# Guarded so that importing this module builds `app` without launching it;
# launch() is only called when the file is executed as a script.
if __name__ == "__main__":
    app.launch()