Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -351,18 +351,24 @@ def audio_float_to_int16(
|
|
351 |
audio_norm = audio_norm.astype("int16")
|
352 |
return audio_norm
|
353 |
|
354 |
-
def inferencing(model, config,
|
355 |
audios = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
if config["phoneme_type"] == "PhonemeType.ESPEAK":
|
357 |
config["phoneme_type"] = "espeak"
|
358 |
-
|
|
|
359 |
for phonemes in text:
|
360 |
-
phoneme_ids = phonemes_to_ids(config, phonemes)
|
361 |
num_speakers = config["num_speakers"]
|
362 |
-
|
363 |
-
speaker_id = None # for now
|
364 |
-
else:
|
365 |
-
speaker_id = sid
|
366 |
text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
|
367 |
text_lengths = np.array([text.shape[1]], dtype=np.int64)
|
368 |
scales = np.array(
|
@@ -371,8 +377,8 @@ def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale
|
|
371 |
)
|
372 |
sid = None
|
373 |
if speaker_id is not None:
|
374 |
-
|
375 |
-
|
376 |
audio = model.run(
|
377 |
None,
|
378 |
{
|
@@ -382,18 +388,15 @@ def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale
|
|
382 |
"sid": sid,
|
383 |
},
|
384 |
)[0].squeeze((0, 1))
|
385 |
-
audio = audio_float_to_int16(audio.squeeze())
|
386 |
audios.append(audio)
|
|
|
387 |
merged_audio = np.concatenate(audios)
|
388 |
sample_rate = config["audio"]["sample_rate"]
|
389 |
temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
|
390 |
sf.write(temp_audio_path, merged_audio, config["audio"]["sample_rate"])
|
391 |
audio = AudioSegment.from_mp3(temp_audio_path)
|
392 |
return audio
|
393 |
-
# return FileResponse(temp_audio_path)
|
394 |
-
# Return the audio file as a FastAPI response
|
395 |
-
# display(Markdown(f"{line}"))
|
396 |
-
# display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))
|
397 |
|
398 |
def denoise(
|
399 |
audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
|
|
|
351 |
audio_norm = audio_norm.astype("int16")
|
352 |
return audio_norm
|
353 |
|
354 |
+
def inferencing(model, config, selected_speaker_id, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
    """Synthesize speech for ``line`` with a piper-style ONNX voice model.

    Parameters
    ----------
    model:
        ONNX session-like object exposing ``run(output_names, inputs)``.
    config : dict
        Piper voice config; this function reads ``phoneme_type``,
        ``num_speakers`` and ``audio.sample_rate``.
    selected_speaker_id:
        Speaker index from the UI. Anything not convertible to ``int``
        (including ``None``) falls back to single-speaker inference.
    line : str
        Text to synthesize.
    length_scale, noise_scale, noise_scale_w : float
        Piper synthesis scales.
    auto_play : bool
        Kept for interface compatibility; not used in this function.

    Returns
    -------
    pydub.AudioSegment
        The synthesized audio, decoded from a temporary WAV file.
    """
    audios = []

    # Coerce the UI-provided speaker id. int(None) raises TypeError and
    # int("abc") raises ValueError, so catch both and degrade gracefully to
    # single-speaker mode instead of crashing on bad UI input.
    try:
        speaker_id = int(selected_speaker_id)
    except (TypeError, ValueError):
        speaker_id = None

    # Some exported configs store the repr of the enum member rather than
    # its value; normalize it so downstream phonemization works.
    if config["phoneme_type"] == "PhonemeType.ESPEAK":
        config["phoneme_type"] = "espeak"

    # One phoneme sequence per sentence. Use a distinct name instead of
    # rebinding `text` inside the loop that iterates it (the original
    # reused `text` for both the sentence list and the id array).
    sentences = phonemize(config, line)
    for phonemes in sentences:
        phoneme_ids = phonemes_to_ids(config, phonemes)

        # Shape (1, T) int64 ids plus their length, as the ONNX graph expects.
        input_ids = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
        # NOTE(review): the scale ordering below is reconstructed from the
        # piper convention — the diff hid these lines; confirm against the
        # full app.py.
        scales = np.array(
            [noise_scale, length_scale, noise_scale_w],
            dtype=np.float32,
        )

        sid = None
        if speaker_id is not None:
            # The model expects the speaker id as a 1-D int64 array.
            sid = np.array([speaker_id], dtype=np.int64)

        # NOTE(review): the first three input names are reconstructed from
        # the piper convention — the diff hid these dict entries; confirm
        # against the full app.py. Only "sid" was visible.
        audio = model.run(
            None,
            {
                "input": input_ids,
                "input_lengths": input_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        audios.append(audio)

    merged_audio = np.concatenate(audios)
    sample_rate = config["audio"]["sample_rate"]
    temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
    sf.write(temp_audio_path, merged_audio, sample_rate)
    # BUG FIX: soundfile writes a RIFF/WAV file here, but the original code
    # decoded it with AudioSegment.from_mp3(), which forces ffmpeg to parse
    # the WAV bytes as MP3. Read it with the WAV loader instead.
    audio = AudioSegment.from_wav(temp_audio_path)
    return audio
|
|
|
|
|
|
|
|
|
400 |
|
401 |
def denoise(
|
402 |
audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
|