File size: 10,404 Bytes
cf9596a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import random
import string
import gradio as gr
from hailuo_tts import HailuoTTS
import os

# Shared TTS client. Starts as None, is assigned by authorize() once
# HailuoTTS.create() succeeds, and is then read by the text_to_speech() and
# clone_voice() handlers. Being a module-level global, it is a single object
# shared by everything running in this process.
tts_instance = None

def authorize(api_key, group_id):
    """Build the shared TTS client from the given credentials.

    Returns a pair of Gradio updates. When ``HailuoTTS.create`` succeeds the
    first update is ``visible=True`` and the second ``visible=False``; when it
    raises, the first is ``visible=False`` and the second is made visible and
    carries the "Authorization error: ..." text.
    """
    global tts_instance
    try:
        tts_instance = HailuoTTS.create(api_key=api_key, group_id=group_id)
    except Exception as exc:
        # Keep the main UI hidden and surface the failure reason instead
        failure_message = f"Authorization error: {str(exc)}"
        return gr.update(visible=False), gr.update(visible=True, value=failure_message)
    else:
        return gr.update(visible=True), gr.update(visible=False)

def on_model_change(model):
    """Return a visibility update that is shown only for the "turbo" model."""
    if model == "turbo":
        return gr.update(visible=True)
    return gr.update(visible=False)

def text_to_speech(text, model, voice, speed, volume, pitch, emotion, language,
                  sample_rate, bitrate, audio_format, channel):
    """Synthesize ``text`` with the shared TTS client and return the audio path.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        model: Model name passed to ``set_model`` ("turbo" enables emotions).
        voice: Voice ID passed to ``set_voice``.
        speed, volume, pitch: Voice parameters; converted to float/float/int.
        emotion: Emotion name; applied only when ``model == "turbo"`` and set.
        language: Language boost; applied unless it equals ``"auto"``.
        sample_rate, bitrate, channel: Audio settings; converted to int.
        audio_format: Audio format; also used as the output file extension.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when the request is rejected or fails.
    """
    # Report a missing authorization explicitly instead of leaking an
    # AttributeError about NoneType to the user.
    if tts_instance is None:
        return None, "Error: Not authorized. Please enter your API key and Group ID first."

    # Nothing to synthesize: do not send an empty request to the service.
    if not text or not text.strip():
        return None, "Error: Text is empty. Please enter text for speech."

    try:
        # Update settings
        tts_instance.set_model(model)
        tts_instance.set_voice(voice)
        tts_instance.set_voice_params(speed=float(speed), volume=float(volume), pitch=int(pitch))

        # NOTE(review): emotion and language boost are only ever set here,
        # never cleared; whether values from an earlier call persist on the
        # instance depends on HailuoTTS -- confirm.
        if model == "turbo" and emotion:
            tts_instance.set_emotion(emotion)

        if language != "auto":
            tts_instance.set_language_boost(language)

        # Update audio settings
        tts_instance.update_audio_settings(
            sample_rate=int(sample_rate),
            bitrate=int(bitrate),
            format=audio_format,
            channel=int(channel)
        )

        # Generate speech
        # NOTE(review): the path is a fixed name relative to the working
        # directory, so overlapping requests would target the same file.
        output_path = f"output.{audio_format}"
        tts_instance.text_to_speech(text, output_path)

        return output_path, "Audio generated successfully!"
    except Exception as e:
        # Handler boundary: show the failure in the status box
        return None, f"Error: {str(e)}"

def generate_random_voice_id():
    """Return ``"random_"`` followed by 12 random ASCII letters/digits."""
    alphabet = string.ascii_letters + string.digits
    suffix = ''.join(random.choices(alphabet, k=12))
    return f"random_{suffix}"

def show_voice_id_input(use_custom_voice_id):
    """Return a visibility update: hidden when the flag is truthy, shown otherwise."""
    if use_custom_voice_id:
        return gr.update(visible=False)
    return gr.update(visible=True)

def clone_voice(audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize, use_custom_voice_id):
    """Upload a voice sample, clone it, and return the preview audio.

    Args:
        audio_file: Uploaded sample; either an object exposing the file path
            as ``.name`` or the path itself.
        voice_id: Voice ID typed by the user; used only when
            ``use_custom_voice_id`` is falsy.
        noise_reduction: Forwarded to ``tts_instance.clone_voice``.
        preview_text: Forwarded to ``tts_instance.clone_voice``.
        accuracy: Forwarded to ``tts_instance.clone_voice`` as a float.
        volume_normalize: Forwarded to ``tts_instance.clone_voice``.
        use_custom_voice_id: Despite the name, a truthy value means "generate
            a random voice ID"; a falsy value means "use ``voice_id``".

    Returns:
        ``(demo_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when the request is rejected or fails.
    """
    # Report a missing authorization explicitly instead of leaking an
    # AttributeError about NoneType to the user.
    if tts_instance is None:
        return None, "Error: Not authorized. Please enter your API key and Group ID first."

    # No upload yet: there is nothing to send to the service.
    if audio_file is None:
        return None, "Error: No audio file provided. Please upload an audio file first."

    # A typed voice ID is mandatory when the random ID option is switched off.
    if not use_custom_voice_id:
        voice_id = (voice_id or "").strip()
        if not voice_id:
            return None, "Error: Voice ID is empty. Enter a Voice ID or enable Random Voice ID."

    try:
        # Upload file. The path is taken from .name when present; a plain
        # path string is used as-is (presumably what some Gradio versions or
        # configurations pass -- confirm against the installed version).
        file_path = getattr(audio_file, "name", audio_file)
        file_id = tts_instance.upload_voice_file(file_path)

        if use_custom_voice_id:
            voice_id = generate_random_voice_id()

        # Clone voice; only the preview path from the returned pair is used
        _response, demo_path = tts_instance.clone_voice(
            file_id=file_id,
            voice_id=voice_id,
            noise_reduction=noise_reduction,
            preview_text=preview_text,
            accuracy=float(accuracy),
            volume_normalize=volume_normalize
        )

        return demo_path, f"Voice cloned successfully! Voice ID: {voice_id}"
    except Exception as e:
        # Handler boundary: show the failure in the status box
        return None, f"Error: {str(e)}"

# Create interface
# Create interface: an authorization panel, a tab set that stays hidden until
# authorization succeeds, and the event wiring for both.
with gr.Blocks() as app:
    # Authorization screen
    with gr.Accordion("Authorization", open=True):
        gr.Markdown("""

# Hailio TTS - Text-to-Speech Service



## Important Links

1. List of supported languages: https://www.hailuo.ai/audio

2. Get your API credentials:

   - Group ID and API Key can be found at:

   - https://intl.minimaxi.com/user-center/basic-information

   - https://intl.minimaxi.com/user-center/basic-information/interface-key



## Pricing

- Turbo Model: $50 per 1M characters

- HD Model: $30 per 1M characters

- Voice Cloning:

  - Verified voice clone: $3 per voice

  - Unverified voice clone: Free

""")
        with gr.Row(visible=True) as auth_row:
            with gr.Column():
                # Both credentials are masked in the browser
                api_key = gr.Textbox(label="API Key",type="password", placeholder="Enter your API key")
                group_id = gr.Textbox(label="Group ID",type="password", placeholder="Enter your Group ID")
                auth_btn = gr.Button("Authorize")
                auth_error = gr.Textbox(label="Status", interactive=False)

    # Main interface (initially hidden; revealed by the Authorize button handler)
    with gr.Tabs(visible=False) as tabs:
        # TTS tab
        with gr.Tab("Text to Speech"):
            with gr.Row():
                with gr.Column():
                    # Main parameters
                    text_input = gr.Textbox(label="Text", placeholder="Enter text for speech", lines=5)
                    model = gr.Dropdown(choices=["turbo", "hd"], value="hd",info="Emotions work only with turbo model", label="Model")
                    voice = gr.Dropdown(choices=HailuoTTS.VOICES, allow_custom_value=True, value="Friendly_Person", label="VoiceId", info="You can set a custom value here, for example you can specify the voice ID that you cloned in another tab, but keep in mind the note written in clone voice")

                    with gr.Row():
                        speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed")
                        volume = gr.Slider(minimum=0, maximum=10, value=1.0, label="Volume")
                        pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch")

                    # Additional parameters
                    # Hidden at start because the default model is "hd"; the
                    # model.change handler below toggles its visibility.
                    emotion = gr.Dropdown(choices=HailuoTTS.EMOTIONS, label="Emotion", visible=False)
                    language = gr.Dropdown(choices=HailuoTTS.SUPPORTED_LANGUAGES, value="auto", label="Language Boost",info="Language Boost increases the accuracy of the voice, but only work with supported languages")

                    # Audio settings in accordion; choices and defaults come
                    # from HailuoTTS.AUDIO_CONSTRAINTS (last entry for sample
                    # rate and bitrate, first entry for format and channel)
                    with gr.Accordion("Audio Settings", open=True):
                        with gr.Row():
                            sample_rate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"][-1],
                                label="Sample Rate"
                            )
                            bitrate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"][-1],
                                label="Bitrate"
                            )
                        with gr.Row():
                            audio_format = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["format"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["format"][0],
                                label="Format"
                            )
                            channel = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["channel"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["channel"][0],
                                label="Channels"
                            )

                # Generation button and output
                with gr.Column():
                    tts_output = gr.Audio(label="Result")
                    tts_status = gr.Textbox(label="Status", interactive=False)
                    tts_btn = gr.Button("Generate")

        # Clone Voice tab
        with gr.Tab("Clone Voice"):
            gr.Markdown("""

            ### File Requirements:

            - Formats: MP3, M4A, WAV

            - Duration: 10s to 5min 

            - Size: Less than 20MB

            - Quality: Clear voice recording with minimal background noise

            - Content: Natural speech in any language

            """)

            with gr.Row():
                with gr.Column():
                    # Cloning parameters
                    audio_file = gr.File(label="Audio File", file_types=["audio"])
                    # Checked (the default) means clone_voice generates a random
                    # ID and the Voice ID textbox stays hidden; unchecking
                    # reveals the textbox so the user can type an ID. The
                    # variable name reads the other way round.
                    use_custom_voice_id = gr.Checkbox(label="Random Voice ID",value=True,info="Checked: a random voice ID is generated for you. Uncheck to enter a custom voice ID")
                    voice_id = gr.Textbox(label="Voice ID",visible=False, placeholder="Minimum 8 characters, letters and numbers,first letter must be a letter")

                    with gr.Row():
                        noise_reduction = gr.Checkbox(label="Noise Reduction", value=False)
                        volume_normalize = gr.Checkbox(label="Volume Normalization", value=False)

                    preview_text = gr.Textbox(label="Preview Text (max 300 characters)",max_length=300, value="Test voice", lines=2)
                    accuracy = gr.Slider(minimum=0, maximum=1, value=0.7, label="Accuracy")

                with gr.Column():
                    clone_output = gr.Audio(label="Preview")
                    clone_status = gr.Textbox(label="Status", interactive=False)
                    clone_btn = gr.Button("Clone")
            gr.Markdown("""

# Important Notes:

1. When you get a voice preview, it is synthesized using the turbo model.

2. You don't pay $3 for voice cloning. You only pay for synthesis.

3. You can copy the resulting ID and use it in the TTS tab. Please note that as soon as you use it at least once, you will be charged $3 for voice creation. It will be linked to your account. Make sure to save this ID somewhere to use it in TTS later.

4. Unverified voice cloning is free, but it life time is limited to 7 days.

""")

    # Event handlers
    # authorize returns two updates: one for the tab set, one for the status box
    auth_btn.click(
        authorize,
        inputs=[api_key, group_id],
        outputs=[tabs, auth_error]
    )

    # Show or hide the emotion dropdown whenever the model selection changes
    model.change(
        on_model_change,
        inputs=[model],
        outputs=[emotion]
    )

    # Input order must match the positional parameters of text_to_speech
    tts_btn.click(
        text_to_speech,
        inputs=[
            text_input, model, voice, speed, volume, pitch, emotion, language,
            sample_rate, bitrate, audio_format, channel
        ],
        outputs=[tts_output, tts_status]
    )

    # Input order must match the positional parameters of clone_voice
    clone_btn.click(
        clone_voice,
        inputs=[audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize,use_custom_voice_id],
        outputs=[clone_output, clone_status]
    )

    # Toggle the Voice ID textbox when the "Random Voice ID" checkbox changes
    use_custom_voice_id.change(
        show_voice_id_input,
        inputs=[use_custom_voice_id],
        outputs=[voice_id]
    )
# Launch interface only when this file is run as a script, not when imported
if __name__ == "__main__":
    app.launch()