Spaces:
Running
on
T4
Running
on
T4
Update tortoise/api.py
Browse files- tortoise/api.py +25 -6
tortoise/api.py
CHANGED
@@ -277,22 +277,41 @@ class TextToSpeech:
|
|
277 |
settings.update(kwargs) # allow overriding of preset settings with kwargs
|
278 |
for audio_frame in self.tts(text, **settings):
|
279 |
yield audio_frame
|
280 |
-
|
281 |
-
|
282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
wav_chunk = wav_gen[:-overlap_len]
|
|
|
|
|
284 |
if wav_gen_prev is not None:
|
285 |
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
|
|
|
|
|
286 |
if wav_overlap is not None:
|
287 |
-
|
288 |
-
|
289 |
-
wav_chunk[:overlap_len]
|
|
|
290 |
wav_chunk[:overlap_len] += crossfade_wav
|
|
|
|
|
291 |
wav_overlap = wav_gen[-overlap_len:]
|
|
|
|
|
292 |
wav_gen_prev = wav_gen
|
|
|
293 |
return wav_chunk, wav_gen_prev, wav_overlap
|
294 |
|
295 |
|
|
|
296 |
def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
|
297 |
return_deterministic_state=False, overlap_wav_len=1024, stream_chunk_size=40,
|
298 |
# autoregressive generation parameters follow
|
|
|
277 |
settings.update(kwargs) # allow overriding of preset settings with kwargs
|
278 |
for audio_frame in self.tts(text, **settings):
|
279 |
yield audio_frame
|
280 |
+
def handle_chunks(
|
281 |
+
self,
|
282 |
+
wav_gen: torch.Tensor,
|
283 |
+
wav_gen_prev: torch.Tensor,
|
284 |
+
wav_overlap: torch.Tensor,
|
285 |
+
overlap_len: int
|
286 |
+
) -> tuple:
|
287 |
+
"""
|
288 |
+
Handle chunk formatting in streaming mode.
|
289 |
+
"""
|
290 |
+
# Extract the current chunk without overlap
|
291 |
wav_chunk = wav_gen[:-overlap_len]
|
292 |
+
|
293 |
+
# If there's a previous chunk, extract the portion that's not overlapping
|
294 |
if wav_gen_prev is not None:
|
295 |
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
|
296 |
+
|
297 |
+
# Perform the crossfade if there is an overlap
|
298 |
if wav_overlap is not None:
|
299 |
+
crossfade_window = torch.linspace(0.0, 1.0, overlap_len).to(wav_gen.device)
|
300 |
+
|
301 |
+
crossfade_wav = wav_chunk[:overlap_len] * crossfade_window
|
302 |
+
wav_chunk[:overlap_len] = wav_overlap * (1 - crossfade_window)
|
303 |
wav_chunk[:overlap_len] += crossfade_wav
|
304 |
+
|
305 |
+
# Save the last part of this chunk for overlapping with the next chunk
|
306 |
wav_overlap = wav_gen[-overlap_len:]
|
307 |
+
|
308 |
+
# Update wav_gen_prev for the next iteration
|
309 |
wav_gen_prev = wav_gen
|
310 |
+
|
311 |
return wav_chunk, wav_gen_prev, wav_overlap
|
312 |
|
313 |
|
314 |
+
|
315 |
def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
|
316 |
return_deterministic_state=False, overlap_wav_len=1024, stream_chunk_size=40,
|
317 |
# autoregressive generation parameters follow
|