Spaces:

mazalaai
/

tts

Sleeping

MAZALA2024 committed on Dec 14, 2024

Commit

13a3616

verified ·

1 Parent(s): 8ebe7fa

Update voice_processing.py

Files changed (1) hide show

voice_processing.py CHANGED Viewed

@@ -92,20 +92,22 @@ def process_audio(model, audio_file, logger, index_rate=0, use_uploaded_voice=Tr
     try:
         logger.info("Starting audio processing")
-        if model is None:
-            logger.error("No model provided for processing")
-            return None
-        # Load and process audio
         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
-        if f0_method == "rmvpe":
-            vc.model_rmvpe = rmvpe_model
         times = [0, 0, 0]
         audio_opt = vc.pipeline(
-            hubert_model,
-            net_g,
-            0,  # sid
             audio,
             audio_file,
             times,
@@ -123,6 +125,9 @@ def process_audio(model, audio_file, logger, index_rate=0, use_uploaded_voice=Tr
             f0_file=None
         )
         info = f"Success. Time: npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
         logger.info(info)
         return (info, None, (tgt_sr, audio_opt))

     try:
         logger.info("Starting audio processing")
+        # Load audio using librosa directly (matching original working code)
+        audio, sr = librosa.load(audio_file, sr=16000, mono=True)
+        logger.info(f"Loaded audio: sr={sr}Hz, shape={audio.shape}")
+        # Get model data using existing function
         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
+        # Set RMVPE
+        vc.model_rmvpe = rmvpe_model
+        # Process using the VC pipeline that we know works
         times = [0, 0, 0]
         audio_opt = vc.pipeline(
+            hubert_model,  # Use global hubert model
+            net_g,  # Use the generator from model_data
+            0,  # speaker id
             audio,
             audio_file,
             times,
             f0_file=None
         )
+        if tgt_sr != 0 and tgt_sr >= 16000:
+            tgt_sr = resample_sr
         info = f"Success. Time: npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
         logger.info(info)
         return (info, None, (tgt_sr, audio_opt))