Spaces:
Running
on
Zero
Running
on
Zero
mrfakename
committed on
Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
@@ -10,7 +10,6 @@ import numpy as np
|
|
10 |
import soundfile as sf
|
11 |
import torchaudio
|
12 |
from cached_path import cached_path
|
13 |
-
from pydub import AudioSegment
|
14 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
15 |
|
16 |
try:
|
@@ -114,58 +113,6 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
|
|
114 |
return (final_sample_rate, final_wave), spectrogram_path
|
115 |
|
116 |
|
117 |
-
@gpu_decorator
|
118 |
-
def generate_podcast(
|
119 |
-
script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, model, remove_silence
|
120 |
-
):
|
121 |
-
# Split the script into speaker blocks
|
122 |
-
speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
|
123 |
-
speaker_blocks = speaker_pattern.split(script)[1:] # Skip the first empty element
|
124 |
-
|
125 |
-
generated_audio_segments = []
|
126 |
-
|
127 |
-
for i in range(0, len(speaker_blocks), 2):
|
128 |
-
speaker = speaker_blocks[i]
|
129 |
-
text = speaker_blocks[i + 1].strip()
|
130 |
-
|
131 |
-
# Determine which speaker is talking
|
132 |
-
if speaker == speaker1_name:
|
133 |
-
ref_audio = ref_audio1
|
134 |
-
ref_text = ref_text1
|
135 |
-
elif speaker == speaker2_name:
|
136 |
-
ref_audio = ref_audio2
|
137 |
-
ref_text = ref_text2
|
138 |
-
else:
|
139 |
-
continue # Skip if the speaker is neither speaker1 nor speaker2
|
140 |
-
|
141 |
-
# Generate audio for this block
|
142 |
-
audio, _ = infer(ref_audio, ref_text, text, model, remove_silence)
|
143 |
-
|
144 |
-
# Convert the generated audio to a numpy array
|
145 |
-
sr, audio_data = audio
|
146 |
-
|
147 |
-
# Save the audio data as a WAV file
|
148 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
149 |
-
sf.write(temp_file.name, audio_data, sr)
|
150 |
-
audio_segment = AudioSegment.from_wav(temp_file.name)
|
151 |
-
|
152 |
-
generated_audio_segments.append(audio_segment)
|
153 |
-
|
154 |
-
# Add a short pause between speakers
|
155 |
-
pause = AudioSegment.silent(duration=500) # 500ms pause
|
156 |
-
generated_audio_segments.append(pause)
|
157 |
-
|
158 |
-
# Concatenate all audio segments
|
159 |
-
final_podcast = sum(generated_audio_segments)
|
160 |
-
|
161 |
-
# Export the final podcast
|
162 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
163 |
-
podcast_path = temp_file.name
|
164 |
-
final_podcast.export(podcast_path, format="wav")
|
165 |
-
|
166 |
-
return podcast_path
|
167 |
-
|
168 |
-
|
169 |
with gr.Blocks() as app_credits:
|
170 |
gr.Markdown("""
|
171 |
# Credits
|
@@ -225,53 +172,6 @@ with gr.Blocks() as app_tts:
|
|
225 |
outputs=[audio_output, spectrogram_output],
|
226 |
)
|
227 |
|
228 |
-
with gr.Blocks() as app_podcast:
|
229 |
-
gr.Markdown("# Podcast Generation")
|
230 |
-
speaker1_name = gr.Textbox(label="Speaker 1 Name")
|
231 |
-
ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
|
232 |
-
ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
|
233 |
-
|
234 |
-
speaker2_name = gr.Textbox(label="Speaker 2 Name")
|
235 |
-
ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
|
236 |
-
ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
|
237 |
-
|
238 |
-
script_input = gr.Textbox(
|
239 |
-
label="Podcast Script",
|
240 |
-
lines=10,
|
241 |
-
placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...",
|
242 |
-
)
|
243 |
-
|
244 |
-
podcast_model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
|
245 |
-
podcast_remove_silence = gr.Checkbox(
|
246 |
-
label="Remove Silences",
|
247 |
-
value=True,
|
248 |
-
)
|
249 |
-
generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
|
250 |
-
podcast_output = gr.Audio(label="Generated Podcast")
|
251 |
-
|
252 |
-
def podcast_generation(
|
253 |
-
script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence
|
254 |
-
):
|
255 |
-
return generate_podcast(
|
256 |
-
script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence
|
257 |
-
)
|
258 |
-
|
259 |
-
generate_podcast_btn.click(
|
260 |
-
podcast_generation,
|
261 |
-
inputs=[
|
262 |
-
script_input,
|
263 |
-
speaker1_name,
|
264 |
-
ref_audio_input1,
|
265 |
-
ref_text_input1,
|
266 |
-
speaker2_name,
|
267 |
-
ref_audio_input2,
|
268 |
-
ref_text_input2,
|
269 |
-
podcast_model_choice,
|
270 |
-
podcast_remove_silence,
|
271 |
-
],
|
272 |
-
outputs=podcast_output,
|
273 |
-
)
|
274 |
-
|
275 |
|
276 |
def parse_speechtypes_text(gen_text):
|
277 |
# Pattern to find {speechtype}
|
@@ -298,7 +198,7 @@ def parse_speechtypes_text(gen_text):
|
|
298 |
return segments
|
299 |
|
300 |
|
301 |
-
with gr.Blocks() as
|
302 |
# New section for emotional generation
|
303 |
gr.Markdown(
|
304 |
"""
|
@@ -306,9 +206,13 @@ with gr.Blocks() as app_emotional:
|
|
306 |
|
307 |
This section allows you to upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the "Add Speech Type" button. Enter your text in the format shown below, and the system will generate speech using the appropriate emotions. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
|
308 |
|
309 |
-
**Example Input:**
|
310 |
-
|
311 |
-
{
|
|
|
|
|
|
|
|
|
312 |
"""
|
313 |
)
|
314 |
|
@@ -392,7 +296,11 @@ with gr.Blocks() as app_emotional:
|
|
392 |
delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
|
393 |
|
394 |
# Text input for the prompt
|
395 |
-
gen_text_input_emotional = gr.Textbox(
|
|
|
|
|
|
|
|
|
396 |
|
397 |
# Model choice
|
398 |
model_choice_emotional = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
|
@@ -694,8 +602,8 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
|
|
694 |
"""
|
695 |
)
|
696 |
gr.TabbedInterface(
|
697 |
-
[app_tts,
|
698 |
-
["TTS", "
|
699 |
)
|
700 |
|
701 |
|
|
|
10 |
import soundfile as sf
|
11 |
import torchaudio
|
12 |
from cached_path import cached_path
|
|
|
13 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
14 |
|
15 |
try:
|
|
|
113 |
return (final_sample_rate, final_wave), spectrogram_path
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
with gr.Blocks() as app_credits:
|
117 |
gr.Markdown("""
|
118 |
# Credits
|
|
|
172 |
outputs=[audio_output, spectrogram_output],
|
173 |
)
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
def parse_speechtypes_text(gen_text):
|
177 |
# Pattern to find {speechtype}
|
|
|
198 |
return segments
|
199 |
|
200 |
|
201 |
+
with gr.Blocks() as app_multistyle:
|
202 |
# New section for emotional generation
|
203 |
gr.Markdown(
|
204 |
"""
|
|
|
206 |
|
207 |
This section allows you to upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the "Add Speech Type" button. Enter your text in the format shown below, and the system will generate speech using the appropriate emotions. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
|
208 |
|
209 |
+
**Example Input:**
|
210 |
+
{Regular} Hello, I'd like to order a sandwich please.
|
211 |
+
{Surprised} What do you mean you're out of bread?
|
212 |
+
{Sad} I really wanted a sandwich though...
|
213 |
+
{Angry} You know what, darn you and your little shop!
|
214 |
+
{Whisper} I'll just go back home and cry now.
|
215 |
+
{Shouting} Why me?!
|
216 |
"""
|
217 |
)
|
218 |
|
|
|
296 |
delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
|
297 |
|
298 |
# Text input for the prompt
|
299 |
+
gen_text_input_emotional = gr.Textbox(
|
300 |
+
label="Text to Generate ( Make sure the type names you entered match the Speech Type Name above ! ! ! )",
|
301 |
+
lines=10,
|
302 |
+
placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
|
303 |
+
)
|
304 |
|
305 |
# Model choice
|
306 |
model_choice_emotional = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
|
|
|
602 |
"""
|
603 |
)
|
604 |
gr.TabbedInterface(
|
605 |
+
[app_tts, app_multistyle, app_chat, app_credits],
|
606 |
+
["TTS", "Multi-Style", "Voice-Chat", "Credits"],
|
607 |
)
|
608 |
|
609 |
|