Spaces:

mazalaai
/

tts

Sleeping

App Files Community

MAZALA2024 committed on Oct 19, 2024

Commit

6f00c3d

verified ·

1 Parent(s): d5183ee

Update voice_processing.py

Browse files

Files changed (1) hide show

voice_processing.py +30 -45

voice_processing.py CHANGED Viewed

@@ -23,11 +23,7 @@ from lib.infer_pack.models import (
 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Set logging levels for other libraries
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("markdown_it").setLevel(logging.WARNING)
@@ -56,7 +52,7 @@ def model_data(model_name):
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
-    logger.info(f"Loading {pth_path}")
     cpt = torch.load(pth_path, map_location="cpu")
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
@@ -76,7 +72,7 @@ def model_data(model_name):
         raise ValueError("Unknown version")
     del net_g.enc_q
     net_g.load_state_dict(cpt["weight"], strict=False)
-    logger.info("Model loaded")
     net_g.eval().to(config.device)
     if config.is_half:
         net_g = net_g.half()
@@ -90,11 +86,11 @@ def model_data(model_name):
         if f.endswith(".index")
     ]
     if len(index_files) == 0:
-        logger.info("No index file found")
         index_file = ""
     else:
         index_file = index_files[0]
-        logger.info(f"Index file found: {index_file}")
     return tgt_sr, net_g, vc, version, index_file, if_f0
@@ -123,8 +119,6 @@ def run_async_in_thread(fn, *args):
     loop.close()
     return result
-executor = ThreadPoolExecutor(max_workers=config.n_cpu)
 def parallel_tts(tasks):
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
@@ -139,24 +133,21 @@ async def tts(
     use_uploaded_voice,
     uploaded_voice,
 ):
-    try:
-        # Default values for parameters used in EdgeTTS
-        speed = 0  # Default speech speed
-        f0_up_key = 0  # Default pitch adjustment
-        f0_method = "rmvpe"  # Default pitch extraction method
-        protect = 0.33  # Default protect value
-        filter_radius = 3
-        resample_sr = 0
-        rms_mix_rate = 0.25
-        edge_time = 0  # Initialize edge_time
-        edge_output_filename = get_unique_filename("mp3")
-        logger.info(f"Starting TTS process for text: {tts_text[:50]}...")
         if use_uploaded_voice:
             if uploaded_voice is None:
-                logger.error("No voice file uploaded.")
                 return "No voice file uploaded.", None, None
             # Process the uploaded voice file
@@ -165,11 +156,9 @@ async def tts(
                 uploaded_file_path = tmp_file.name
             audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
-            logger.info(f"Uploaded voice file loaded. Shape: {audio.shape}, SR: {sr}")
         else:
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
-                logger.error(f"Text characters exceed limit: {len(tts_text)} characters.")
                 return (
                     f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
                     None,
@@ -186,13 +175,11 @@ async def tts(
             edge_time = t1 - t0
             audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
-            logger.info(f"Edge TTS audio generated. Shape: {audio.shape}, SR: {sr}")
         # Common processing after loading the audio
         duration = len(audio) / sr
-        logger.info(f"Audio duration: {duration}s")
         if limitation and duration >= 20000:
-            logger.error(f"Audio duration exceeds limit: {duration}s")
             return (
                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                 None,
@@ -208,7 +195,6 @@ async def tts(
         # Perform voice conversion pipeline
         times = [0, 0, 0]
-        logger.info(f"Starting voice conversion with audio shape: {audio.shape}")
         audio_opt = vc.pipeline(
             hubert_model,
             net_g,
@@ -229,22 +215,28 @@ async def tts(
             protect,
             None,
         )
-        logger.info(f"Voice conversion completed. Output shape: {audio_opt.shape}")
         if tgt_sr != resample_sr and resample_sr >= 16000:
             tgt_sr = resample_sr
-        info = f"Success. Time: tts: {edge_time:.2f}s, npy: {times[0]:.2f}s, f0: {times[1]:.2f}s, infer: {times[2]:.2f}s"
-        logger.info(info)
         return (
             info,
             edge_output_filename if not use_uploaded_voice else None,
             (tgt_sr, audio_opt),
         )
     except Exception as e:
-        logger.exception("Error in TTS processing")
-        return str(e), None, (None, None)
 voice_mapping = {
     "Mongolian Male": "mn-MN-BataaNeural",
@@ -294,11 +286,4 @@ async def parallel_tts_processor(tasks):
 def parallel_tts_wrapper(tasks):
     loop = asyncio.get_event_loop()
-    return loop.run_until_complete(parallel_tts_processor(tasks))
-# Keep the original parallel_tts function
-# def parallel_tts(tasks):
-#     with ThreadPoolExecutor() as executor:
-#         futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
-#         results = [future.result() for future in futures]
-#     return results

 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
+# Set logging levels
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("markdown_it").setLevel(logging.WARNING)
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
+    print(f"Loading {pth_path}")
     cpt = torch.load(pth_path, map_location="cpu")
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
         raise ValueError("Unknown version")
     del net_g.enc_q
     net_g.load_state_dict(cpt["weight"], strict=False)
+    print("Model loaded")
     net_g.eval().to(config.device)
     if config.is_half:
         net_g = net_g.half()
         if f.endswith(".index")
     ]
     if len(index_files) == 0:
+        print("No index file found")
         index_file = ""
     else:
         index_file = index_files[0]
+        print(f"Index file found: {index_file}")
     return tgt_sr, net_g, vc, version, index_file, if_f0
     loop.close()
     return result
 def parallel_tts(tasks):
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
     use_uploaded_voice,
     uploaded_voice,
 ):
+    # Default values for parameters used in EdgeTTS
+    speed = 0  # Default speech speed
+    f0_up_key = 0  # Default pitch adjustment
+    f0_method = "rmvpe"  # Default pitch extraction method
+    protect = 0.33  # Default protect value
+    filter_radius = 3
+    resample_sr = 0
+    rms_mix_rate = 0.25
+    edge_time = 0  # Initialize edge_time
+    edge_output_filename = get_unique_filename("mp3")
+    try:
         if use_uploaded_voice:
             if uploaded_voice is None:
                 return "No voice file uploaded.", None, None
             # Process the uploaded voice file
                 uploaded_file_path = tmp_file.name
             audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
         else:
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
                 return (
                     f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
                     None,
             edge_time = t1 - t0
             audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
         # Common processing after loading the audio
         duration = len(audio) / sr
+        print(f"Audio duration: {duration}s")
         if limitation and duration >= 20000:
             return (
                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                 None,
         # Perform voice conversion pipeline
         times = [0, 0, 0]
         audio_opt = vc.pipeline(
             hubert_model,
             net_g,
             protect,
             None,
         )
         if tgt_sr != resample_sr and resample_sr >= 16000:
             tgt_sr = resample_sr
+        info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
+        print(info)
         return (
             info,
             edge_output_filename if not use_uploaded_voice else None,
             (tgt_sr, audio_opt),
         )
+    except EOFError:
+        info = (
+            "output not valid. This may occur when input text and speaker do not match."
+        )
+        print(info)
+        return info, None, None
     except Exception as e:
+        traceback_info = traceback.format_exc()
+        print(traceback_info)
+        return str(e), None, None
 voice_mapping = {
     "Mongolian Male": "mn-MN-BataaNeural",
 def parallel_tts_wrapper(tasks):
     loop = asyncio.get_event_loop()
+    return loop.run_until_complete(parallel_tts_processor(tasks))