Update voice_processing.py
Browse files- voice_processing.py +68 -59
voice_processing.py
CHANGED
@@ -6,8 +6,6 @@ import time
|
|
6 |
import traceback
|
7 |
import tempfile
|
8 |
from concurrent.futures import ThreadPoolExecutor
|
9 |
-
import base64
|
10 |
-
|
11 |
|
12 |
import edge_tts
|
13 |
import librosa
|
@@ -45,13 +43,6 @@ model_root = "weights"
|
|
45 |
models = [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
|
46 |
models.sort()
|
47 |
|
48 |
-
def get_voices():
    """Return the human-readable voice names available for TTS selection."""
    # Iterating a dict yields its keys, so this equals list(voice_mapping.keys()).
    return list(voice_mapping)
|
50 |
-
|
51 |
-
def get_model_names():
    """List the names of the voice-conversion models found on disk.

    A model is any subdirectory of the ``weights`` folder; the directory
    name doubles as the model name.
    """
    weights_dir = "weights"  # Adjust this path if your models are stored elsewhere
    entries = os.listdir(weights_dir)
    return [name for name in entries if os.path.isdir(os.path.join(weights_dir, name))]
|
54 |
-
|
55 |
def get_unique_filename(extension):
|
56 |
return f"{uuid.uuid4()}.{extension}"
|
57 |
|
@@ -116,6 +107,10 @@ def load_hubert():
|
|
116 |
hubert_model = hubert_model.float()
|
117 |
return hubert_model.eval()
|
118 |
|
|
|
|
|
|
|
|
|
119 |
# Add this helper function to ensure a new event loop is created if none exists
|
120 |
def run_async_in_thread(fn, *args):
|
121 |
loop = asyncio.new_event_loop()
|
@@ -138,47 +133,67 @@ async def tts(
|
|
138 |
use_uploaded_voice,
|
139 |
uploaded_voice,
|
140 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
edge_output_filename = get_unique_filename("mp3")
|
142 |
-
try:
|
143 |
-
# Default values for parameters
|
144 |
-
speed = 0
|
145 |
-
f0_up_key = 0
|
146 |
-
f0_method = "rmvpe"
|
147 |
-
protect = 0.33
|
148 |
-
filter_radius = 3
|
149 |
-
resample_sr = 0
|
150 |
-
rms_mix_rate = 0.25
|
151 |
-
edge_time = 0
|
152 |
|
|
|
153 |
if use_uploaded_voice:
|
154 |
if uploaded_voice is None:
|
155 |
-
|
156 |
|
|
|
157 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
158 |
tmp_file.write(uploaded_voice)
|
159 |
uploaded_file_path = tmp_file.name
|
|
|
160 |
audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
|
161 |
else:
|
|
|
162 |
if limitation and len(tts_text) > 12000:
|
163 |
-
|
|
|
|
|
|
|
|
|
164 |
|
|
|
165 |
t0 = time.time()
|
166 |
speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
|
167 |
-
await edge_tts.Communicate(
|
168 |
-
|
|
|
|
|
|
|
|
|
169 |
audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
|
170 |
|
|
|
171 |
duration = len(audio) / sr
|
172 |
print(f"Audio duration: {duration}s")
|
173 |
if limitation and duration >= 20000:
|
174 |
-
|
|
|
|
|
|
|
|
|
175 |
|
176 |
f0_up_key = int(f0_up_key)
|
177 |
tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
|
178 |
|
|
|
179 |
if f0_method == "rmvpe":
|
180 |
vc.model_rmvpe = rmvpe_model
|
181 |
|
|
|
182 |
times = [0, 0, 0]
|
183 |
audio_opt = vc.pipeline(
|
184 |
hubert_model,
|
@@ -204,49 +219,40 @@ async def tts(
|
|
204 |
if tgt_sr != resample_sr and resample_sr >= 16000:
|
205 |
tgt_sr = resample_sr
|
206 |
|
207 |
-
info = f"Success. Time: tts: {edge_time
|
208 |
print(info)
|
209 |
-
|
210 |
-
# Convert audio to base64
|
211 |
-
with open(edge_output_filename, "rb") as audio_file:
|
212 |
-
audio_base64 = base64.b64encode(audio_file.read()).decode('utf-8')
|
213 |
-
|
214 |
-
audio_data_uri = f"data:audio/mp3;base64,{audio_base64}"
|
215 |
-
|
216 |
return (
|
217 |
info,
|
218 |
-
|
219 |
-
(tgt_sr, audio_opt)
|
220 |
)
|
221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
except Exception as e:
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
if os.path.exists(edge_output_filename):
|
227 |
-
os.remove(edge_output_filename)
|
228 |
-
return (str(e), None, None)
|
229 |
|
230 |
voice_mapping = {
|
231 |
"Mongolian Male": "mn-MN-BataaNeural",
|
232 |
"Mongolian Female": "mn-MN-YesuiNeural"
|
233 |
-
# Add more mappings as needed
|
234 |
}
|
235 |
|
236 |
hubert_model = load_hubert()
|
237 |
|
238 |
rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
|
239 |
|
240 |
-
#
|
241 |
-
max_concurrent_tasks = 16 # Adjust based on server capacity
|
242 |
-
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
243 |
-
|
244 |
-
# Global ThreadPoolExecutor
|
245 |
-
executor = ThreadPoolExecutor(max_workers=max_concurrent_tasks)
|
246 |
-
|
247 |
class TTSProcessor:
|
248 |
def __init__(self, config):
|
249 |
self.config = config
|
|
|
|
|
250 |
self.queue = asyncio.Queue()
|
251 |
self.is_processing = False
|
252 |
|
@@ -260,28 +266,31 @@ class TTSProcessor:
|
|
260 |
return await task
|
261 |
|
262 |
async def _tts_task(self, model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
|
263 |
-
async with semaphore:
|
264 |
return await tts(model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice)
|
265 |
|
266 |
async def _process_queue(self):
|
267 |
self.is_processing = True
|
268 |
while not self.queue.empty():
|
269 |
task = await self.queue.get()
|
270 |
-
|
271 |
-
|
272 |
-
except asyncio.CancelledError:
|
273 |
-
print("Task was cancelled")
|
274 |
-
except Exception as e:
|
275 |
-
print(f"Task failed with error: {e}")
|
276 |
-
finally:
|
277 |
-
self.queue.task_done()
|
278 |
self.is_processing = False
|
279 |
|
280 |
# Initialize the TTSProcessor
|
281 |
tts_processor = TTSProcessor(config)
|
282 |
|
|
|
283 |
async def parallel_tts_processor(tasks):
|
284 |
return await asyncio.gather(*(tts_processor.tts(*task) for task in tasks))
|
285 |
|
286 |
-
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
import traceback
|
7 |
import tempfile
|
8 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
9 |
|
10 |
import edge_tts
|
11 |
import librosa
|
|
|
43 |
# Discover the available model folders under the weights directory.
# sorted() on the generator yields the same final value as building the
# unsorted list and then calling .sort() in place.
models = sorted(
    entry
    for entry in os.listdir(model_root)
    if os.path.isdir(f"{model_root}/{entry}")
)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
def get_unique_filename(extension):
    """Return a collision-safe random filename with the given extension.

    The stem is a freshly generated UUID4, so concurrent callers will not
    clash on output paths.
    """
    unique_stem = uuid.uuid4()
    return "{}.{}".format(unique_stem, extension)
|
48 |
|
|
|
107 |
hubert_model = hubert_model.float()
|
108 |
return hubert_model.eval()
|
109 |
|
110 |
+
def get_model_names():
    """Return the model names: one per subdirectory of the weights folder."""
    weights_dir = "weights"  # Assuming this is where your models are stored
    # os.scandir avoids a second stat per entry compared to listdir+isdir.
    return [entry.name for entry in os.scandir(weights_dir) if entry.is_dir()]
|
113 |
+
|
114 |
# Add this helper function to ensure a new event loop is created if none exists
|
115 |
def run_async_in_thread(fn, *args):
|
116 |
loop = asyncio.new_event_loop()
|
|
|
133 |
use_uploaded_voice,
|
134 |
uploaded_voice,
|
135 |
):
|
136 |
+
# Default values for parameters used in EdgeTTS
|
137 |
+
speed = 0 # Default speech speed
|
138 |
+
f0_up_key = 0 # Default pitch adjustment
|
139 |
+
f0_method = "rmvpe" # Default pitch extraction method
|
140 |
+
protect = 0.33 # Default protect value
|
141 |
+
filter_radius = 3
|
142 |
+
resample_sr = 0
|
143 |
+
rms_mix_rate = 0.25
|
144 |
+
edge_time = 0 # Initialize edge_time
|
145 |
+
|
146 |
edge_output_filename = get_unique_filename("mp3")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
+
try:
|
149 |
if use_uploaded_voice:
|
150 |
if uploaded_voice is None:
|
151 |
+
return "No voice file uploaded.", None, None
|
152 |
|
153 |
+
# Process the uploaded voice file
|
154 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
155 |
tmp_file.write(uploaded_voice)
|
156 |
uploaded_file_path = tmp_file.name
|
157 |
+
|
158 |
audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
|
159 |
else:
|
160 |
+
# EdgeTTS processing
|
161 |
if limitation and len(tts_text) > 12000:
|
162 |
+
return (
|
163 |
+
f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
|
164 |
+
None,
|
165 |
+
None,
|
166 |
+
)
|
167 |
|
168 |
+
# Invoke Edge TTS
|
169 |
t0 = time.time()
|
170 |
speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
|
171 |
+
await edge_tts.Communicate(
|
172 |
+
tts_text, tts_voice, rate=speed_str
|
173 |
+
).save(edge_output_filename)
|
174 |
+
t1 = time.time()
|
175 |
+
edge_time = t1 - t0
|
176 |
+
|
177 |
audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
|
178 |
|
179 |
+
# Common processing after loading the audio
|
180 |
duration = len(audio) / sr
|
181 |
print(f"Audio duration: {duration}s")
|
182 |
if limitation and duration >= 20000:
|
183 |
+
return (
|
184 |
+
f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
|
185 |
+
None,
|
186 |
+
None,
|
187 |
+
)
|
188 |
|
189 |
f0_up_key = int(f0_up_key)
|
190 |
tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
|
191 |
|
192 |
+
# Setup for RMVPE or other pitch extraction methods
|
193 |
if f0_method == "rmvpe":
|
194 |
vc.model_rmvpe = rmvpe_model
|
195 |
|
196 |
+
# Perform voice conversion pipeline
|
197 |
times = [0, 0, 0]
|
198 |
audio_opt = vc.pipeline(
|
199 |
hubert_model,
|
|
|
219 |
if tgt_sr != resample_sr and resample_sr >= 16000:
|
220 |
tgt_sr = resample_sr
|
221 |
|
222 |
+
info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
|
223 |
print(info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
return (
|
225 |
info,
|
226 |
+
edge_output_filename if not use_uploaded_voice else None,
|
227 |
+
(tgt_sr, audio_opt),
|
228 |
)
|
229 |
|
230 |
+
except EOFError:
|
231 |
+
info = (
|
232 |
+
"output not valid. This may occur when input text and speaker do not match."
|
233 |
+
)
|
234 |
+
print(info)
|
235 |
+
return info, None, None
|
236 |
except Exception as e:
|
237 |
+
traceback_info = traceback.format_exc()
|
238 |
+
print(traceback_info)
|
239 |
+
return str(e), None, None
|
|
|
|
|
|
|
240 |
|
241 |
# Friendly display name -> Edge TTS neural voice identifier.
voice_mapping = {
    "Mongolian Male": "mn-MN-BataaNeural",
    "Mongolian Female": "mn-MN-YesuiNeural",
}
|
245 |
|
246 |
hubert_model = load_hubert()
|
247 |
|
248 |
rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
|
249 |
|
250 |
+
# Add the optimized TTSProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
class TTSProcessor:
|
252 |
def __init__(self, config):
    """Set up the processor's concurrency primitives from *config*.

    config is the project configuration object; this reads its ``n_cpu``
    and ``max_concurrent_tts`` attributes — confirm their presence against
    the Config class.
    """
    self.config = config
    # Queue of pending TTS coroutines plus a flag showing whether the
    # drain loop is currently running.
    self.queue = asyncio.Queue()
    self.is_processing = False
    # Bound thread-pool work by CPU count and concurrent TTS calls by the
    # configured limit.
    self.executor = ThreadPoolExecutor(max_workers=config.n_cpu)
    self.semaphore = asyncio.Semaphore(config.max_concurrent_tts)
|
258 |
|
|
|
266 |
return await task
|
267 |
|
268 |
async def _tts_task(self, model_name, tts_text, tts_voice, index_rate, use_uploaded_voice, uploaded_voice):
    """Run one tts() call, gated by the instance's concurrency semaphore."""
    # Manual acquire/release is equivalent to `async with self.semaphore:`.
    await self.semaphore.acquire()
    try:
        result = await tts(
            model_name,
            tts_text,
            tts_voice,
            index_rate,
            use_uploaded_voice,
            uploaded_voice,
        )
        return result
    finally:
        self.semaphore.release()
|
271 |
|
272 |
async def _process_queue(self):
    """Drain the queue, awaiting each queued TTS task in turn.

    A failing or cancelled task must not abort the drain loop: without
    per-task handling, one raised exception leaves ``is_processing`` stuck
    at True and skips ``queue.task_done()``, which permanently breaks
    ``Queue.join()`` accounting for every later producer.
    """
    self.is_processing = True
    try:
        while not self.queue.empty():
            task = await self.queue.get()
            try:
                await task
            except asyncio.CancelledError:
                print("Task was cancelled")
            except Exception as e:
                print(f"Task failed with error: {e}")
            finally:
                # Always balance the preceding get(), even on failure.
                self.queue.task_done()
    finally:
        self.is_processing = False
|
279 |
|
280 |
# Initialize the TTSProcessor
|
281 |
tts_processor = TTSProcessor(config)
|
282 |
|
283 |
+
# Update parallel_tts to use TTSProcessor
|
284 |
async def parallel_tts_processor(tasks):
    """Run every TTS task concurrently through the shared TTSProcessor.

    Each element of *tasks* is the positional-argument tuple expected by
    ``tts_processor.tts``; results come back in input order.
    """
    pending = [tts_processor.tts(*task_args) for task_args in tasks]
    return await asyncio.gather(*pending)
|
286 |
|
287 |
+
def parallel_tts_wrapper(tasks):
    """Synchronous entry point for running a batch of TTS tasks.

    ``asyncio.get_event_loop()`` raises RuntimeError (and is deprecated
    since Python 3.10) when called from a thread that has no current event
    loop — the usual situation for this sync wrapper when invoked from a
    worker thread — so fall back to creating and installing a fresh loop.
    NOTE(review): this still cannot be called while the chosen loop is
    already running; callers inside async code should await
    parallel_tts_processor(tasks) directly.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(parallel_tts_processor(tasks))
|
290 |
+
|
291 |
+
# Keep the original parallel_tts function
|
292 |
+
# def parallel_tts(tasks):
|
293 |
+
# with ThreadPoolExecutor() as executor:
|
294 |
+
# futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
|
295 |
+
# results = [future.result() for future in futures]
|
296 |
+
# return results
|