Spaces:

mazalaai
/

tts

Sleeping

App Files Community

MAZALA2024 committed on Oct 21, 2024

Commit

1d94066

verified ·

1 Parent(s): 282faa3

Update voice_processing.py

Browse files

Files changed (1) hide show

voice_processing.py +37 -57

voice_processing.py CHANGED Viewed

@@ -14,6 +14,7 @@ from fairseq import checkpoint_utils
 import uuid
 from config import Config
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
@@ -23,6 +24,9 @@ from lib.infer_pack.models import (
 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
 # Set logging levels
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
@@ -34,21 +38,9 @@ limitation = os.getenv("SYSTEM") == "spaces"
 config = Config()
-# Define voice_mapping first to ensure it's always available
-voice_mapping = {
-    "Mongolian Male": "mn-MN-BataaNeural",
-    "Mongolian Female": "mn-MN-YesuiNeural"
-}
 # Edge TTS voices
-try:
-    loop = asyncio.get_event_loop()
-    tts_voice_list = loop.run_until_complete(edge_tts.list_voices())
-    tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]
-except Exception as e:
-    logging.error(f"Error loading Edge TTS voices: {e}")
-    tts_voice_list = []
-    tts_voices = []
 # RVC models directory
 model_root = "weights"
@@ -58,24 +50,15 @@ models.sort()
 def get_unique_filename(extension):
     return f"{uuid.uuid4()}.{extension}"
-model_cache = {}
 def model_data(model_name):
-    if model_name in model_cache:
-        return model_cache[model_name]
     pth_path = [
         f"{model_root}/{model_name}/{f}"
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
     print(f"Loading {pth_path}")
-    try:
-        cpt = torch.load(pth_path, map_location="cpu")
-    except Exception as e:
-        logging.error(f"Error loading model {pth_path}: {e}")
-        raise e
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     if_f0 = cpt.get("f0", 1)
@@ -114,32 +97,34 @@ def model_data(model_name):
         index_file = index_files[0]
         print(f"Index file found: {index_file}")
-    model_cache[model_name] = (tgt_sr, net_g, vc, version, index_file, if_f0)
-    return model_cache[model_name]
 def load_hubert():
-    try:
-        models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-            ["hubert_base.pt"],
-            suffix="",
-        )
-        hubert_model = models[0]
-        hubert_model = hubert_model.to(config.device)
-        if config.is_half:
-            hubert_model = hubert_model.half()
-        else:
-            hubert_model = hubert_model.float()
-        return hubert_model.eval()
-    except Exception as e:
-        logging.error(f"Error loading HuBERT model: {e}")
-        raise e
 def get_model_names():
     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
-# Initialize a global ThreadPoolExecutor
-executor = ThreadPoolExecutor(max_workers=20)  # Adjust based on your server
 def run_async_in_thread(fn, *args):
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
@@ -148,8 +133,10 @@ def run_async_in_thread(fn, *args):
     return result
 def parallel_tts(tasks):
-    futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
-    results = [future.result() for future in futures]
     return results
 async def tts(
@@ -187,7 +174,7 @@ async def tts(
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
                 return (
-                    f"Text characters should be at most 12000 in this Hugging Face Space, but got {len(tts_text)} characters.",
                     None,
                     None,
                 )
@@ -206,15 +193,15 @@ async def tts(
         # Common processing after loading the audio
         duration = len(audio) / sr
         print(f"Audio duration: {duration}s")
-        if limitation and duration >= 20:
             return (
-                f"Audio should be less than 20 seconds in this Hugging Face Space, but got {duration}s.",
                 None,
                 None,
             )
         f0_up_key = int(f0_up_key)
-        # Load the model using cached data
         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
         # Setup for RMVPE or other pitch extraction methods
@@ -266,10 +253,3 @@ async def tts(
         print(traceback_info)
         return str(e), None, None
-# Initialize the global models
-try:
-    hubert_model = load_hubert()
-    rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
-except Exception as e:
-    logging.error(f"Failed to initialize global models: {e}")
-    # Optionally, you can exit or handle the error as needed

 import uuid
 from config import Config
+from config import Config, voice_mapping
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
+model_cache = {}
 # Set logging levels
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
 config = Config()
 # Edge TTS voices
+tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]
 # RVC models directory
 model_root = "weights"
 def get_unique_filename(extension):
     return f"{uuid.uuid4()}.{extension}"
 def model_data(model_name):
+    # We will not modify this function to cache models
     pth_path = [
         f"{model_root}/{model_name}/{f}"
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
     print(f"Loading {pth_path}")
+    cpt = torch.load(pth_path, map_location="cpu")
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     if_f0 = cpt.get("f0", 1)
         index_file = index_files[0]
         print(f"Index file found: {index_file}")
+    return tgt_sr, net_g, vc, version, index_file, if_f0
 def load_hubert():
+    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+        ["hubert_base.pt"],
+        suffix="",
+    )
+    hubert_model = models[0]
+    hubert_model = hubert_model.to(config.device)
+    if config.is_half:
+        hubert_model = hubert_model.half()
+    else:
+        hubert_model = hubert_model.float()
+    return hubert_model.eval()
 def get_model_names():
     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
+# Initialize the global models
+hubert_model = load_hubert()
+rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
+voice_mapping = {
+    "Mongolian Male": "mn-MN-BataaNeural",
+    "Mongolian Female": "mn-MN-YesuiNeural"
+}
+# Function to run async functions in a new event loop within a thread
 def run_async_in_thread(fn, *args):
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     return result
 def parallel_tts(tasks):
+    # Increase max_workers to better utilize CPU and GPU resources
+    with ThreadPoolExecutor(max_workers=8) as executor:  # Adjust based on your server capacity
+        futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
+        results = [future.result() for future in futures]
     return results
 async def tts(
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
                 return (
+                    f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
                     None,
                     None,
                 )
         # Common processing after loading the audio
         duration = len(audio) / sr
         print(f"Audio duration: {duration}s")
+        if limitation and duration >= 20000:
             return (
+                f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                 None,
                 None,
             )
         f0_up_key = int(f0_up_key)
+        # Load the model
         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
         # Setup for RMVPE or other pitch extraction methods
         print(traceback_info)
         return str(e), None, None