Update app.py
app.py CHANGED
@@ -351,24 +351,19 @@ def audio_float_to_int16(
     audio_norm = audio_norm.astype("int16")
     return audio_norm
 
-def inferencing(model, config, selected_speaker_id, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
+
+def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
     audios = []
-
-    # Ensure selected_speaker_id is a valid integer or handle it gracefully
-    try:
-        speaker_id = int(selected_speaker_id)
-    except ValueError:
-        # Handle the case where selected_speaker_id is not a valid integer
-        speaker_id = None  # Use a default value or handle it differently
-
     if config["phoneme_type"] == "PhonemeType.ESPEAK":
         config["phoneme_type"] = "espeak"
-
-    text = phonemize(config, line)  # Make sure phonemize function is defined
+    text = phonemize(config, line)
     for phonemes in text:
         phoneme_ids = phonemes_to_ids(config, phonemes)
         num_speakers = config["num_speakers"]
-
+        if num_speakers == 1:
+            speaker_id = None  # for now
+        else:
+            speaker_id = sid
         text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
         text_lengths = np.array([text.shape[1]], dtype=np.int64)
         scales = np.array(
@@ -377,8 +372,7 @@ def inferencing(model, config, selected_speaker_id, line, length_scale, noise_sc
         )
         sid = None
         if speaker_id is not None:
-            sid = np.
-
+            sid = np.asarray([int(speaker_id)], dtype=np.int64)  # Convert to 1D array
         audio = model.run(
             None,
             {
@@ -388,9 +382,8 @@ def inferencing(model, config, selected_speaker_id, line, length_scale, noise_sc
                 "sid": sid,
             },
         )[0].squeeze((0, 1))
         audio = audio_float_to_int16(audio.squeeze())
         audios.append(audio)
-
     merged_audio = np.concatenate(audios)
     sample_rate = config["audio"]["sample_rate"]
     temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
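For review context, here is a minimal, self-contained sketch of what the rewritten loop body does for each phoneme sequence. Only the `"sid"` key is visible in the hunks above; the `"input"`, `"input_lengths"`, and `"scales"` input names and the scale ordering follow the usual Piper ONNX convention and are assumptions, as is the helper name `synthesize`. The sketch also feeds `"sid"` only when a speaker id is set, whereas the diff passes it unconditionally (a single-speaker graph has no `"sid"` input and would reject it).

```python
import numpy as np
import onnxruntime

def synthesize(model: onnxruntime.InferenceSession, phoneme_ids, num_speakers,
               sid=None, length_scale=1.0, noise_scale=0.667, noise_scale_w=0.8):
    # Single-speaker voices take no speaker id (the new `if num_speakers == 1` branch).
    speaker_id = None if num_speakers == 1 else sid

    text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    text_lengths = np.array([text.shape[1]], dtype=np.int64)
    # Ordering is an assumption; the diff truncates the np.array(...) call.
    scales = np.array([noise_scale, length_scale, noise_scale_w], dtype=np.float32)

    inputs = {"input": text, "input_lengths": text_lengths, "scales": scales}
    if speaker_id is not None:
        # The second hunk's fix: the ONNX graph expects a 1-D int64 tensor,
        # not a bare Python int (the old line was left dangling at `sid = np.`).
        inputs["sid"] = np.asarray([int(speaker_id)], dtype=np.int64)

    # run(None, ...) returns every model output; [0] is the audio tensor.
    return model.run(None, inputs)[0].squeeze((0, 1))
```

One caveat worth flagging: the patched loop reassigns `sid` inside the loop body (`sid = None`, then `sid = np.asarray(...)`), so from the second iteration onward `speaker_id = sid` picks up the 1-D array rather than the id passed to the function. It happens to work because `int()` still accepts a one-element array, but keeping the two names separate, as in the sketch, avoids the fragility.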
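The visible context ends at the temp-file path, so the write itself is not part of this change. For completeness, a hypothetical continuation using the stdlib `wave` module, assuming mono int16 output at the config's sample rate (the function name `write_wav` is illustrative, not from app.py):

```python
import wave

import numpy as np

def write_wav(path: str, audio_int16: np.ndarray, sample_rate: int) -> None:
    # Persist the merged int16 buffer as a mono WAV file.
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)    # mono
        wav_file.setsampwidth(2)    # int16 -> 2 bytes per sample
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_int16.tobytes())

# e.g. write_wav(temp_audio_path, merged_audio, sample_rate)
```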