mrfakename committed on
Commit
7796571
1 Parent(s): 97cf9a5

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (2) hide show
  1. app.py +1 -1
  2. src/f5_tts/infer/utils_infer.py +26 -21
app.py CHANGED
@@ -567,7 +567,7 @@ Have a conversation with an AI using your reference voice!
567
  return history, conv_state, ""
568
 
569
  text = ""
570
- text = preprocess_ref_audio_text(audio_path, text)[1]
571
 
572
  if not text.strip():
573
  return history, conv_state, ""
 
567
  return history, conv_state, ""
568
 
569
  text = ""
570
+ text = preprocess_ref_audio_text(audio_path, text, clip_short=False)[1]
571
 
572
  if not text.strip():
573
  return history, conv_state, ""
src/f5_tts/infer/utils_infer.py CHANGED
@@ -177,36 +177,41 @@ def load_model(model_cls, model_cfg, ckpt_path, vocab_file="", ode_method=ode_me
177
  # preprocess reference audio and text
178
 
179
 
180
- def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print, device=device):
181
  show_info("Converting audio...")
182
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
183
  aseg = AudioSegment.from_file(ref_audio_orig)
184
 
185
- # 1. try to find long silence for clipping
186
- non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000)
187
- non_silent_wave = AudioSegment.silent(duration=0)
188
- for non_silent_seg in non_silent_segs:
189
- if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 16000:
190
- show_info("Audio is over 15s, clipping short.")
191
- break
192
- non_silent_wave += non_silent_seg
193
-
194
- # 2. try to find short silence for clipping if 1. failed
195
- if len(non_silent_wave) > 15000:
196
- non_silent_segs = silence.split_on_silence(aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000)
197
  non_silent_wave = AudioSegment.silent(duration=0)
198
  for non_silent_seg in non_silent_segs:
199
- if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 16000:
200
- show_info("Audio is over 15s, clipping short.")
201
  break
202
  non_silent_wave += non_silent_seg
203
 
204
- aseg = non_silent_wave
205
-
206
- # 3. if no proper silence found for clipping
207
- if len(aseg) > 15000:
208
- aseg = aseg[:15000]
209
- show_info("Audio is over 15s, clipping short.")
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  aseg.export(f.name, format="wav")
212
  ref_audio = f.name
 
177
  # preprocess reference audio and text
178
 
179
 
180
+ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
181
  show_info("Converting audio...")
182
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
183
  aseg = AudioSegment.from_file(ref_audio_orig)
184
 
185
+ if clip_short:
186
+ # 1. try to find long silence for clipping
187
+ non_silent_segs = silence.split_on_silence(
188
+ aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
189
+ )
 
 
 
 
 
 
 
190
  non_silent_wave = AudioSegment.silent(duration=0)
191
  for non_silent_seg in non_silent_segs:
192
+ if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
193
+ show_info("Audio is over 15s, clipping short. (1)")
194
  break
195
  non_silent_wave += non_silent_seg
196
 
197
+ # 2. try to find short silence for clipping if 1. failed
198
+ if len(non_silent_wave) > 15000:
199
+ non_silent_segs = silence.split_on_silence(
200
+ aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000
201
+ )
202
+ non_silent_wave = AudioSegment.silent(duration=0)
203
+ for non_silent_seg in non_silent_segs:
204
+ if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
205
+ show_info("Audio is over 15s, clipping short. (2)")
206
+ break
207
+ non_silent_wave += non_silent_seg
208
+
209
+ aseg = non_silent_wave
210
+
211
+ # 3. if no proper silence found for clipping
212
+ if len(aseg) > 15000:
213
+ aseg = aseg[:15000]
214
+ show_info("Audio is over 15s, clipping short. (3)")
215
 
216
  aseg.export(f.name, format="wav")
217
  ref_audio = f.name