Spaces:
Running
Running
Upload 2 files
Browse files- app.py +238 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import string
|
3 |
+
import gradio as gr
|
4 |
+
from hailuo_tts import HailuoTTS
|
5 |
+
import os
|
6 |
+
|
7 |
+
# Global variable to store TTS instance
|
8 |
+
tts_instance = None
|
9 |
+
|
10 |
+
def authorize(api_key, group_id):
    """Create the shared TTS client from the supplied credentials.

    Args:
        api_key: API key typed into the authorization form.
        group_id: Group ID typed into the authorization form.

    Returns:
        A pair of Gradio updates, in the order the click handler binds them
        (main tabs, status textbox): on success the tabs are shown and the
        status box hidden; on failure the tabs are hidden and the status box
        is shown with an "Authorization error: ..." message.
    """
    global tts_instance

    # Pasted credentials often carry stray whitespace; blank values can never
    # be valid, so reject them here with a clear message.
    api_key = (api_key or "").strip()
    group_id = (group_id or "").strip()
    if not api_key or not group_id:
        return (
            gr.update(visible=False),
            gr.update(
                visible=True,
                value="Authorization error: API key and Group ID are required",
            ),
        )

    # NOTE(review): the client is stored in a module-level global, so it is
    # presumably shared by every session served by this process — confirm
    # whether per-session state is needed.
    try:
        tts_instance = HailuoTTS.create(api_key=api_key, group_id=group_id)
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return (
            gr.update(visible=False),
            gr.update(visible=True, value=f"Authorization error: {e}"),
        )
    return gr.update(visible=True), gr.update(visible=False)
|
18 |
+
|
19 |
+
def on_model_change(model):
    """Return a visibility update driven by the selected model.

    The bound output component (the emotion dropdown in this app) is made
    visible only when ``model`` equals ``"turbo"``.
    """
    return gr.update(visible=(model == "turbo"))
|
23 |
+
|
24 |
+
def text_to_speech(text, model, voice, speed, volume, pitch, emotion, language,
                   sample_rate, bitrate, audio_format, channel):
    """Synthesize ``text`` with the authorized client and return the audio file.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        model: Model name; emotions are only applied when it is ``"turbo"``.
        voice: Voice ID passed to the client.
        speed, volume, pitch: Voice parameters, coerced to float/float/int.
        emotion: Emotion name, used only with the ``"turbo"`` model.
        language: Language boost; ``"auto"`` leaves the client setting untouched.
        sample_rate, bitrate, channel: Audio settings, coerced to int.
        audio_format: Audio format, also used as the output file extension.

    Returns:
        ``(path, status)``: the path of the generated file and a success
        message, or ``(None, "Error: ...")`` when anything fails.
    """
    import tempfile  # local import: only this function needs it

    try:
        # The client only exists after a successful authorization.
        if tts_instance is None:
            return None, "Error: Not authorized. Please enter your API key and Group ID first."
        if not text or not text.strip():
            return None, "Error: Text is empty."

        # Update settings
        tts_instance.set_model(model)
        tts_instance.set_voice(voice)
        tts_instance.set_voice_params(
            speed=float(speed), volume=float(volume), pitch=int(pitch)
        )

        if model == "turbo" and emotion:
            tts_instance.set_emotion(emotion)

        # NOTE(review): a previously applied boost is not reset when "auto"
        # is selected — confirm whether the client needs an explicit reset.
        if language != "auto":
            tts_instance.set_language_boost(language)

        # Update audio settings
        tts_instance.update_audio_settings(
            sample_rate=int(sample_rate),
            bitrate=int(bitrate),
            format=audio_format,
            channel=int(channel),
        )

        # Use a unique file per request: a fixed "output.<format>" path would
        # be overwritten when two requests run at the same time.
        with tempfile.NamedTemporaryFile(
            prefix="hailuo_tts_", suffix=f".{audio_format}", delete=False
        ) as tmp:
            output_path = tmp.name

        # Generate speech
        tts_instance.text_to_speech(text, output_path)

        return output_path, "Audio generated successfully!"
    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return None, f"Error: {e}"
|
55 |
+
|
56 |
+
def generate_random_voice_id():
    """Return a random voice ID of the form ``random_`` + 12 alphanumerics.

    The suffix is drawn from ASCII letters and digits and always contains at
    least one digit.
    """
    alphabet = string.ascii_letters + string.digits
    suffix = random.choices(alphabet, k=11)
    # The Voice ID hint in this app asks for "letters and numbers"; presumably
    # the API rejects IDs without a digit (TODO confirm). A purely random draw
    # has no digit fairly often, so one is inserted at a random position.
    suffix.insert(random.randrange(len(suffix) + 1), random.choice(string.digits))
    return "random_" + "".join(suffix)
|
58 |
+
|
59 |
+
def show_voice_id_input(use_custom_voice_id):
    """Return a visibility update that is the inverse of the checkbox value.

    Despite its name, ``use_custom_voice_id`` carries the state of the
    checkbox labelled "Random Voice ID": when it is truthy the bound
    component (the manual Voice ID field) is hidden, otherwise it is shown.
    """
    hide_field = bool(use_custom_voice_id)
    return gr.update(visible=not hide_field)
|
61 |
+
|
62 |
+
def clone_voice(audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize, use_custom_voice_id):
    """Upload a voice sample and clone it under a voice ID.

    Args:
        audio_file: Uploaded sample; either a path string or an object whose
            ``name`` attribute holds the path.
        voice_id: Manually entered voice ID (used when the random option is off).
        noise_reduction: Forwarded to the client's ``clone_voice``.
        preview_text: Text synthesized for the preview.
        accuracy: Cloning accuracy, coerced to float.
        volume_normalize: Forwarded to the client's ``clone_voice``.
        use_custom_voice_id: State of the "Random Voice ID" checkbox. Despite
            the name, a truthy value means a random ID is generated and
            ``voice_id`` is ignored.

    Returns:
        ``(demo_path, status)``: the preview path reported by the client and a
        success message containing the voice ID, or ``(None, "Error: ...")``.
    """
    try:
        # Validate everything locally before uploading anything.
        if tts_instance is None:
            return None, "Error: Not authorized. Please enter your API key and Group ID first."
        if audio_file is None:
            return None, "Error: Please upload an audio file first."

        # Accept both a plain path and a file-like object exposing ``.name``.
        audio_path = getattr(audio_file, "name", audio_file)

        if use_custom_voice_id:
            voice_id = generate_random_voice_id()
        else:
            voice_id = (voice_id or "").strip()
            if not voice_id:
                return None, "Error: Please enter a Voice ID or enable Random Voice ID."

        # Upload file
        file_id = tts_instance.upload_voice_file(audio_path)

        # Clone voice; only the preview path from the response pair is used.
        _response, demo_path = tts_instance.clone_voice(
            file_id=file_id,
            voice_id=voice_id,
            noise_reduction=noise_reduction,
            preview_text=preview_text,
            accuracy=float(accuracy),
            volume_normalize=volume_normalize,
        )

        return demo_path, f"Voice cloned successfully! Voice ID: {voice_id}"
    except Exception as e:
        # UI boundary: surface the failure in the status box.
        return None, f"Error: {e}"
|
85 |
+
|
86 |
+
# Create interface
|
87 |
+
with gr.Blocks() as app:
    # NOTE(review): the nesting below is reconstructed from a flattened source;
    # confirm the layout against the running app. The Markdown text is indented
    # with the code and relies on Gradio stripping the common leading
    # whitespace — verify the rendering.

    # Authorization screen
    with gr.Accordion("Authorization", open=True):
        gr.Markdown("""
        # Hailuo TTS - Text-to-Speech Service

        ## Important Links
        1. List of supported languages: https://www.hailuo.ai/audio
        2. Get your API credentials:
        - Group ID and API Key can be found at:
        - https://intl.minimaxi.com/user-center/basic-information
        - https://intl.minimaxi.com/user-center/basic-information/interface-key

        ## Pricing
        - Turbo Model: $50 per 1M characters
        - HD Model: $30 per 1M characters
        - Voice Cloning:
        - Verified voice clone: $3 per voice
        - Unverified voice clone: Free
        """)
        with gr.Row(visible=True) as auth_row:
            with gr.Column():
                api_key = gr.Textbox(label="API Key", type="password", placeholder="Enter your API key")
                group_id = gr.Textbox(label="Group ID", type="password", placeholder="Enter your Group ID")
                auth_btn = gr.Button("Authorize")
                auth_error = gr.Textbox(label="Status", interactive=False)

    # Main interface (initially hidden; shown by a successful authorization)
    with gr.Tabs(visible=False) as tabs:
        # TTS tab
        with gr.Tab("Text to Speech"):
            with gr.Row():
                with gr.Column():
                    # Main parameters
                    text_input = gr.Textbox(label="Text", placeholder="Enter text for speech", lines=5)
                    model = gr.Dropdown(
                        choices=["turbo", "hd"],
                        value="hd",
                        info="Emotions work only with turbo model",
                        label="Model",
                    )
                    voice = gr.Dropdown(
                        choices=HailuoTTS.VOICES,
                        allow_custom_value=True,
                        value="Friendly_Person",
                        label="VoiceId",
                        info="You can set a custom value here, for example you can specify the voice ID that you cloned in another tab, but keep in mind the note written in clone voice",
                    )

                    with gr.Row():
                        speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed")
                        volume = gr.Slider(minimum=0, maximum=10, value=1.0, label="Volume")
                        pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch")

                    # Additional parameters; the emotion selector starts hidden
                    # because the default model is "hd".
                    emotion = gr.Dropdown(choices=HailuoTTS.EMOTIONS, label="Emotion", visible=False)
                    language = gr.Dropdown(
                        choices=HailuoTTS.SUPPORTED_LANGUAGES,
                        value="auto",
                        label="Language Boost",
                        info="Language Boost increases the accuracy of the voice, but only work with supported languages",
                    )

                    # Audio settings in accordion
                    with gr.Accordion("Audio Settings", open=True):
                        with gr.Row():
                            sample_rate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["sample_rate"][-1],
                                label="Sample Rate",
                            )
                            bitrate = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["bitrate"][-1],
                                label="Bitrate",
                            )
                        with gr.Row():
                            audio_format = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["format"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["format"][0],
                                label="Format",
                            )
                            channel = gr.Radio(
                                choices=HailuoTTS.AUDIO_CONSTRAINTS["channel"],
                                value=HailuoTTS.AUDIO_CONSTRAINTS["channel"][0],
                                label="Channels",
                            )

                # Generation button and output
                with gr.Column():
                    tts_output = gr.Audio(label="Result")
                    tts_status = gr.Textbox(label="Status", interactive=False)
                    tts_btn = gr.Button("Generate")

        # Clone Voice tab
        with gr.Tab("Clone Voice"):
            gr.Markdown("""
            ### File Requirements:
            - Formats: MP3, M4A, WAV
            - Duration: 10s to 5min
            - Size: Less than 20MB
            - Quality: Clear voice recording with minimal background noise
            - Content: Natural speech in any language
            """)

            with gr.Row():
                with gr.Column():
                    # Cloning parameters
                    audio_file = gr.File(label="Audio File", file_types=["audio"])
                    # Checked means a random ID is generated and the manual
                    # field stays hidden; the variable name is historical.
                    use_custom_voice_id = gr.Checkbox(
                        label="Random Voice ID",
                        value=True,
                        info="When checked, a random voice ID is generated for you. Uncheck this checkbox to enter a custom voice ID",
                    )
                    voice_id = gr.Textbox(
                        label="Voice ID",
                        visible=False,
                        placeholder="Minimum 8 characters, letters and numbers,first letter must be a letter",
                    )

                    with gr.Row():
                        noise_reduction = gr.Checkbox(label="Noise Reduction", value=False)
                        volume_normalize = gr.Checkbox(label="Volume Normalization", value=False)

                    preview_text = gr.Textbox(
                        label="Preview Text (max 300 characters)",
                        max_length=300,
                        value="Test voice",
                        lines=2,
                    )
                    accuracy = gr.Slider(minimum=0, maximum=1, value=0.7, label="Accuracy")

                with gr.Column():
                    clone_output = gr.Audio(label="Preview")
                    clone_status = gr.Textbox(label="Status", interactive=False)
                    clone_btn = gr.Button("Clone")
                    gr.Markdown("""
                    # Important Notes:
                    1. When you get a voice preview, it is synthesized using the turbo model.
                    2. You don't pay $3 for voice cloning. You only pay for synthesis.
                    3. You can copy the resulting ID and use it in the TTS tab. Please note that as soon as you use it at least once, you will be charged $3 for voice creation. It will be linked to your account. Make sure to save this ID somewhere to use it in TTS later.
                    4. Unverified voice cloning is free, but it life time is limited to 7 days.
                    """)

    # Event handlers
    # Successful authorization reveals the tabs; failures show the status box.
    auth_btn.click(
        authorize,
        inputs=[api_key, group_id],
        outputs=[tabs, auth_error],
    )

    # The emotion selector is only shown for the "turbo" model.
    model.change(
        on_model_change,
        inputs=[model],
        outputs=[emotion],
    )

    tts_btn.click(
        text_to_speech,
        inputs=[
            text_input, model, voice, speed, volume, pitch, emotion, language,
            sample_rate, bitrate, audio_format, channel,
        ],
        outputs=[tts_output, tts_status],
    )

    clone_btn.click(
        clone_voice,
        inputs=[audio_file, voice_id, noise_reduction, preview_text, accuracy, volume_normalize, use_custom_voice_id],
        outputs=[clone_output, clone_status],
    )

    # Toggling "Random Voice ID" shows or hides the manual Voice ID field.
    use_custom_voice_id.change(
        show_voice_id_input,
        inputs=[use_custom_voice_id],
        outputs=[voice_id],
    )

# Launch interface
if __name__ == "__main__":
    app.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests
|
2 |
+
uuid
|
3 |
+
gradio
|
4 |
+
hailuo-tts-api
|