mrfakename committed on
Commit
971a624
·
verified ·
1 Parent(s): 342cd99

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (1) hide show
  1. app.py +53 -31
app.py CHANGED
@@ -31,19 +31,6 @@ def gpu_decorator(func):
31
  else:
32
  return func
33
 
34
-
35
-
36
- SPLIT_WORDS = [
37
- "but", "however", "nevertheless", "yet", "still",
38
- "therefore", "thus", "hence", "consequently",
39
- "moreover", "furthermore", "additionally",
40
- "meanwhile", "alternatively", "otherwise",
41
- "namely", "specifically", "for example", "such as",
42
- "in fact", "indeed", "notably",
43
- "in contrast", "on the other hand", "conversely",
44
- "in conclusion", "to summarize", "finally"
45
- ]
46
-
47
  device = (
48
  "cuda"
49
  if torch.cuda.is_available()
@@ -71,7 +58,6 @@ cfg_strength = 2.0
71
  ode_method = "euler"
72
  sway_sampling_coef = -1.0
73
  speed = 1.0
74
- # fix_duration = 27 # None or float (duration in seconds)
75
  fix_duration = None
76
 
77
 
@@ -142,7 +128,7 @@ def chunk_text(text, max_chars=135):
142
  return chunks
143
 
144
  @gpu_decorator
145
- def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
146
  if exp_name == "F5-TTS":
147
  ema_model = F5TTS_ema_model
148
  elif exp_name == "E2-TTS":
@@ -200,8 +186,44 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
200
  generated_waves.append(generated_wave)
201
  spectrograms.append(generated_mel_spec[0].cpu().numpy())
202
 
203
- # Combine all generated waves
204
- final_wave = np.concatenate(generated_waves)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  # Remove silence
207
  if remove_silence:
@@ -227,11 +249,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
227
  return (target_sample_rate, final_wave), spectrogram_path
228
 
229
  @gpu_decorator
230
- def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words=''):
231
- if not custom_split_words.strip():
232
- custom_words = [word.strip() for word in custom_split_words.split(',')]
233
- global SPLIT_WORDS
234
- SPLIT_WORDS = custom_words
235
 
236
  print(gen_text)
237
 
@@ -283,7 +301,8 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
283
  print(f'gen_text {i}', batch_text)
284
 
285
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
286
- return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
 
287
 
288
  @gpu_decorator
289
  def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
@@ -388,12 +407,7 @@ with gr.Blocks() as app_tts:
388
  remove_silence = gr.Checkbox(
389
  label="Remove Silences",
390
  info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
391
- value=True,
392
- )
393
- split_words_input = gr.Textbox(
394
- label="Custom Split Words",
395
- info="Enter custom words to split on, separated by commas. Leave blank to use default list.",
396
- lines=2,
397
  )
398
  speed_slider = gr.Slider(
399
  label="Speed",
@@ -403,6 +417,14 @@ with gr.Blocks() as app_tts:
403
  step=0.1,
404
  info="Adjust the speed of the audio.",
405
  )
 
 
 
 
 
 
 
 
406
  speed_slider.change(update_speed, inputs=speed_slider)
407
 
408
  audio_output = gr.Audio(label="Synthesized Audio")
@@ -416,7 +438,7 @@ with gr.Blocks() as app_tts:
416
  gen_text_input,
417
  model_choice,
418
  remove_silence,
419
- split_words_input,
420
  ],
421
  outputs=[audio_output, spectrogram_output],
422
  )
@@ -664,7 +686,7 @@ with gr.Blocks() as app_emotional:
664
  ref_text = speech_types[current_emotion].get('ref_text', '')
665
 
666
  # Generate speech for this segment
667
- audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, "")
668
  sr, audio_data = audio
669
 
670
  generated_audio_segments.append(audio_data)
 
31
  else:
32
  return func
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  device = (
35
  "cuda"
36
  if torch.cuda.is_available()
 
58
  ode_method = "euler"
59
  sway_sampling_coef = -1.0
60
  speed = 1.0
 
61
  fix_duration = None
62
 
63
 
 
128
  return chunks
129
 
130
  @gpu_decorator
131
+ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
132
  if exp_name == "F5-TTS":
133
  ema_model = F5TTS_ema_model
134
  elif exp_name == "E2-TTS":
 
186
  generated_waves.append(generated_wave)
187
  spectrograms.append(generated_mel_spec[0].cpu().numpy())
188
 
189
+ # Combine all generated waves with cross-fading
190
+ if cross_fade_duration <= 0:
191
+ # Simply concatenate
192
+ final_wave = np.concatenate(generated_waves)
193
+ else:
194
+ final_wave = generated_waves[0]
195
+ for i in range(1, len(generated_waves)):
196
+ prev_wave = final_wave
197
+ next_wave = generated_waves[i]
198
+
199
+ # Calculate cross-fade samples, ensuring it does not exceed wave lengths
200
+ cross_fade_samples = int(cross_fade_duration * target_sample_rate)
201
+ cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
202
+
203
+ if cross_fade_samples <= 0:
204
+ # No overlap possible, concatenate
205
+ final_wave = np.concatenate([prev_wave, next_wave])
206
+ continue
207
+
208
+ # Overlapping parts
209
+ prev_overlap = prev_wave[-cross_fade_samples:]
210
+ next_overlap = next_wave[:cross_fade_samples]
211
+
212
+ # Fade out and fade in
213
+ fade_out = np.linspace(1, 0, cross_fade_samples)
214
+ fade_in = np.linspace(0, 1, cross_fade_samples)
215
+
216
+ # Cross-faded overlap
217
+ cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
218
+
219
+ # Combine
220
+ new_wave = np.concatenate([
221
+ prev_wave[:-cross_fade_samples],
222
+ cross_faded_overlap,
223
+ next_wave[cross_fade_samples:]
224
+ ])
225
+
226
+ final_wave = new_wave
227
 
228
  # Remove silence
229
  if remove_silence:
 
249
  return (target_sample_rate, final_wave), spectrogram_path
250
 
251
  @gpu_decorator
252
+ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
 
 
 
253
 
254
  print(gen_text)
255
 
 
301
  print(f'gen_text {i}', batch_text)
302
 
303
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
304
+ return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
305
+
306
 
307
  @gpu_decorator
308
  def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
 
407
  remove_silence = gr.Checkbox(
408
  label="Remove Silences",
409
  info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
410
+ value=False,
 
 
 
 
 
411
  )
412
  speed_slider = gr.Slider(
413
  label="Speed",
 
417
  step=0.1,
418
  info="Adjust the speed of the audio.",
419
  )
420
+ cross_fade_duration_slider = gr.Slider(
421
+ label="Cross-Fade Duration (s)",
422
+ minimum=0.0,
423
+ maximum=1.0,
424
+ value=0.15,
425
+ step=0.01,
426
+ info="Set the duration of the cross-fade between audio clips.",
427
+ )
428
  speed_slider.change(update_speed, inputs=speed_slider)
429
 
430
  audio_output = gr.Audio(label="Synthesized Audio")
 
438
  gen_text_input,
439
  model_choice,
440
  remove_silence,
441
+ cross_fade_duration_slider,
442
  ],
443
  outputs=[audio_output, spectrogram_output],
444
  )
 
686
  ref_text = speech_types[current_emotion].get('ref_text', '')
687
 
688
  # Generate speech for this segment
689
+ audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
690
  sr, audio_data = audio
691
 
692
  generated_audio_segments.append(audio_data)