Spaces · Running on Zero
mrfakename committed · Commit bc38247 · Parent(s): cbc2a64

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
- app.py +46 -19
- src/f5_tts/infer/utils_infer.py +13 -12
app.py
CHANGED
@@ -51,6 +51,8 @@ E2TTS_ema_model = load_model(
     UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
 )
 
+DEFAULT_TTS_MODEL = "F5-TTS"
+tts_model_choice = DEFAULT_TTS_MODEL
 chat_model_state = None
 chat_tokenizer_state = None
 
@@ -129,7 +131,6 @@ with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
-    model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
         ref_text_input = gr.Textbox(
@@ -162,13 +163,31 @@ with gr.Blocks() as app_tts:
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
 
+    @gpu_decorator
+    def basic_tts(
+        ref_audio_input,
+        ref_text_input,
+        gen_text_input,
+        remove_silence,
+        cross_fade_duration_slider,
+        speed_slider,
+    ):
+        return infer(
+            ref_audio_input,
+            ref_text_input,
+            gen_text_input,
+            tts_model_choice,
+            remove_silence,
+            cross_fade_duration_slider,
+            speed_slider,
+        )
+
     generate_btn.click(
-        infer,
+        basic_tts,
         inputs=[
             ref_audio_input,
             ref_text_input,
             gen_text_input,
-            model_choice,
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
@@ -345,9 +364,6 @@ with gr.Blocks() as app_multistyle:
         outputs=gen_text_input_multistyle,
     )
 
-    # Model choice
-    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
-
     with gr.Accordion("Advanced Settings", open=False):
         remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
@@ -371,7 +387,6 @@ with gr.Blocks() as app_multistyle:
         speech_type_names_list = args[:num_additional_speech_types]
         speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
         speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
-        model_choice = args[3 * num_additional_speech_types + 1]
         remove_silence = args[3 * num_additional_speech_types + 1]
 
         # Collect the speech types and their audios into a dict
@@ -405,7 +420,7 @@ with gr.Blocks() as app_multistyle:
 
             # Generate speech for this segment
             audio, _ = infer(
-                ref_audio, ref_text, text,
+                ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
             ) # show_info=print no pull to top when generating
             sr, audio_data = audio
 
@@ -430,7 +445,6 @@ with gr.Blocks() as app_multistyle:
         + speech_type_audios
         + speech_type_ref_texts
         + [
-            model_choice_multistyle,
             remove_silence_multistyle,
         ],
         outputs=audio_output_multistyle,
@@ -518,11 +532,6 @@ Have a conversation with an AI using your reference voice!
             ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
         with gr.Column():
             with gr.Accordion("Advanced Settings", open=False):
-                model_choice_chat = gr.Radio(
-                    choices=["F5-TTS", "E2-TTS"],
-                    label="TTS Model",
-                    value="F5-TTS",
-                )
                 remove_silence_chat = gr.Checkbox(
                     label="Remove Silences",
                     value=True,
@@ -589,7 +598,7 @@ Have a conversation with an AI using your reference voice!
         return history, conv_state, ""
 
     @gpu_decorator
-    def generate_audio_response(history, ref_audio, ref_text,
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
         """Generate TTS audio for AI response"""
         if not history or not ref_audio:
             return None
@@ -602,7 +611,7 @@ Have a conversation with an AI using your reference voice!
             ref_audio,
             ref_text,
             last_ai_response,
-
+            tts_model_choice,
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
@@ -631,7 +640,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -646,7 +655,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -661,7 +670,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -700,6 +709,24 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
 **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
 """
     )
+
+    def switch_tts_model(new_choice):
+        global tts_model_choice
+        tts_model_choice = new_choice
+
+    if not USING_SPACES:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    else:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    choose_tts_model.change(
+        switch_tts_model,
+        inputs=choose_tts_model,
+    )
+
     gr.TabbedInterface(
         [app_tts, app_multistyle, app_chat, app_credits],
         ["TTS", "Multi-Speech", "Voice-Chat", "Credits"],
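The net effect of the app.py changes is to replace the per-tab "Choose TTS Model" radios with a single module-level tts_model_choice that every handler reads at call time, updated by one shared radio's change callback. Below is a minimal sketch of that pattern, not the app itself: the hypothetical synthesize() stands in for the app's infer(), while the gr.Radio / .change / .click calls are standard Gradio APIs.

import gradio as gr

DEFAULT_TTS_MODEL = "F5-TTS"
tts_model_choice = DEFAULT_TTS_MODEL  # module-level state shared by every tab


def switch_tts_model(new_choice):
    # Radio callback: update the global instead of threading a model input into each handler
    global tts_model_choice
    tts_model_choice = new_choice


def synthesize(text):
    # Hypothetical stand-in for infer(...): handlers read the global at call time,
    # so one selector applies to the TTS, multi-style, and chat tabs alike.
    return f"[{tts_model_choice}] {text}"


with gr.Blocks() as demo:
    choose_tts_model = gr.Radio(
        choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
    )
    choose_tts_model.change(switch_tts_model, inputs=choose_tts_model)

    text_in = gr.Textbox(label="Text to Generate")
    out = gr.Textbox(label="Result")
    gr.Button("Synthesize").click(synthesize, inputs=text_in, outputs=out)

if __name__ == "__main__":
    demo.launch()

The trade-off is the usual one for module-level state: the wrapper functions (basic_tts, generate_audio_response) keep their Gradio input lists free of a model component, at the cost of the choice living outside the event graph.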
src/f5_tts/infer/utils_infer.py
CHANGED
@@ -282,13 +282,13 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         audio_data = audio_file.read()
     audio_hash = hashlib.md5(audio_data).hexdigest()
 
-
-
-
-
-
-
-
+    if not ref_text.strip():
+        global _ref_audio_cache
+        if audio_hash in _ref_audio_cache:
+            # Use cached asr transcription
+            show_info("Using cached reference text...")
+            ref_text = _ref_audio_cache[audio_hash]
+        else:
             global asr_pipe
             if asr_pipe is None:
                 initialize_asr_pipeline(device=device)
@@ -300,11 +300,10 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
                 generate_kwargs={"task": "transcribe"},
                 return_timestamps=False,
             )["text"].strip()
-
-
-
-
-    _ref_audio_cache[audio_hash] = ref_text
+            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
+            _ref_audio_cache[audio_hash] = ref_text
+    else:
+        show_info("Using custom reference text...")
 
     # Ensure ref_text ends with a proper sentence-ending punctuation
     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
@@ -313,6 +312,8 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
     else:
         ref_text += ". "
 
+    print("ref_text ", ref_text)
+
     return ref_audio, ref_text
 
 
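The utils_infer.py change keys the Whisper transcription cache on an MD5 hash of the reference audio bytes and only stores auto-transcribed text, so a user-supplied reference text is never overwritten by (or written into) the cache. A small self-contained sketch of that caching logic, assuming a placeholder transcribe() in place of the real asr_pipe call:

import hashlib

_ref_audio_cache = {}  # audio content hash -> auto-transcribed reference text


def transcribe(audio_path):
    # Hypothetical stand-in for the Whisper ASR pipeline
    return f"transcript of {audio_path}"


def get_ref_text(ref_audio_path, ref_text=""):
    # Hash the raw audio bytes so the cache survives re-uploads of the same clip
    with open(ref_audio_path, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if not ref_text.strip():
        if audio_hash in _ref_audio_cache:
            # Same audio seen before: reuse the cached transcription
            ref_text = _ref_audio_cache[audio_hash]
        else:
            ref_text = transcribe(ref_audio_path)
            # Cache only auto-transcribed text, so manual tweaks stay manual
            _ref_audio_cache[audio_hash] = ref_text
    return ref_text

Because custom text bypasses the cache entirely, editing the reference text and regenerating always uses the edited text, while leaving a blank field keeps reusing the first transcription of that clip.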