Gregniuki committed on
Commit
94f9450
·
1 Parent(s): e13e1dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -14
app.py CHANGED
@@ -351,18 +351,24 @@ def audio_float_to_int16(
351
  audio_norm = audio_norm.astype("int16")
352
  return audio_norm
353
 
354
- def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
355
  audios = []
 
 
 
 
 
 
 
 
356
  if config["phoneme_type"] == "PhonemeType.ESPEAK":
357
  config["phoneme_type"] = "espeak"
358
- text = phonemize(config, line)
 
359
  for phonemes in text:
360
- phoneme_ids = phonemes_to_ids(config, phonemes)
361
  num_speakers = config["num_speakers"]
362
- if num_speakers == 1:
363
- speaker_id = None # for now
364
- else:
365
- speaker_id = sid
366
  text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
367
  text_lengths = np.array([text.shape[1]], dtype=np.int64)
368
  scales = np.array(
@@ -371,8 +377,8 @@ def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale
371
  )
372
  sid = None
373
  if speaker_id is not None:
374
-
375
- sid = np.array([int(speaker_id)], dtype=np.int64) # Ensure sid is a 1D array
376
  audio = model.run(
377
  None,
378
  {
@@ -382,18 +388,15 @@ def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale
382
  "sid": sid,
383
  },
384
  )[0].squeeze((0, 1))
385
- audio = audio_float_to_int16(audio.squeeze())
386
  audios.append(audio)
 
387
  merged_audio = np.concatenate(audios)
388
  sample_rate = config["audio"]["sample_rate"]
389
  temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
390
  sf.write(temp_audio_path, merged_audio, config["audio"]["sample_rate"])
391
  audio = AudioSegment.from_mp3(temp_audio_path)
392
  return audio
393
- # return FileResponse(temp_audio_path)
394
- # Return the audio file as a FastAPI response
395
- # display(Markdown(f"{line}"))
396
- # display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))
397
 
398
  def denoise(
399
  audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
 
351
  audio_norm = audio_norm.astype("int16")
352
  return audio_norm
353
 
354
+ def inferencing(model, config, selected_speaker_id, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
355
  audios = []
356
+
357
+ # Ensure selected_speaker_id is a valid integer or handle it gracefully
358
+ try:
359
+ speaker_id = int(selected_speaker_id)
360
+ except ValueError:
361
+ # Handle the case where selected_speaker_id is not a valid integer
362
+ speaker_id = None # Use a default value or handle it differently
363
+
364
  if config["phoneme_type"] == "PhonemeType.ESPEAK":
365
  config["phoneme_type"] = "espeak"
366
+
367
+ text = phonemize(config, line) # Make sure phonemize function is defined
368
  for phonemes in text:
369
+ phoneme_ids = phonemes_to_ids(config, phonemes) # Make sure phonemes_to_ids function is defined
370
  num_speakers = config["num_speakers"]
371
+
 
 
 
372
  text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
373
  text_lengths = np.array([text.shape[1]], dtype=np.int64)
374
  scales = np.array(
 
377
  )
378
  sid = None
379
  if speaker_id is not None:
380
+ sid = np.array([speaker_id], dtype=np.int64) # Ensure sid is a 1D array
381
+
382
  audio = model.run(
383
  None,
384
  {
 
388
  "sid": sid,
389
  },
390
  )[0].squeeze((0, 1))
391
+ audio = audio_float_to_int16(audio.squeeze()) # Make sure audio_float_to_int16 is defined
392
  audios.append(audio)
393
+
394
  merged_audio = np.concatenate(audios)
395
  sample_rate = config["audio"]["sample_rate"]
396
  temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
397
  sf.write(temp_audio_path, merged_audio, config["audio"]["sample_rate"])
398
  audio = AudioSegment.from_mp3(temp_audio_path)
399
  return audio
 
 
 
 
400
 
401
  def denoise(
402
  audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float