Chillarmo committed
Commit 7c62735 • 1 Parent(s): 0ef49b2

Update app.py

Files changed (1)
  1. app.py (+25 -14)
app.py CHANGED
@@ -1,20 +1,32 @@
 import gradio as gr
 import torch
-from outetts.v0_1.interface import InterfaceHF
+from outetts.v0_1.interface import InterfaceGGUF
 import soundfile as sf
 import tempfile
 import os
 from faster_whisper import WhisperModel
+import huggingface_hub
+
+def download_model():
+    """Download the GGUF model from HuggingFace"""
+    model_path = huggingface_hub.hf_hub_download(
+        repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
+        filename="outetts-0.1-350m.gguf"
+    )
+    return model_path
 
 def initialize_models():
     """Initialize the OuteTTS and Faster-Whisper models"""
-    tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
-    # Use tiny model with lowest compute settings for maximum speed
+    # Download and initialize GGUF model
+    model_path = download_model()
+    tts_interface = InterfaceGGUF(model_path)
+
+    # Initialize Whisper
     asr_model = WhisperModel("tiny",
                              device="cpu",
-                             compute_type="int8",  # Use int8 quantization for efficiency
-                             num_workers=1,  # Limit workers for low-resource environment
-                             cpu_threads=1)  # Limit CPU threads
+                             compute_type="int8",
+                             num_workers=1,
+                             cpu_threads=1)
     return tts_interface, asr_model
 
 # Initialize models globally to avoid reloading
@@ -23,17 +35,15 @@ TTS_INTERFACE, ASR_MODEL = initialize_models()
 def transcribe_audio(audio_path):
     """Transcribe audio using Faster-Whisper tiny"""
     try:
-        # Transcribe with minimal settings for speed
         segments, _ = ASR_MODEL.transcribe(audio_path,
-                                           beam_size=1,
-                                           best_of=1,
-                                           temperature=1.0,
+                                           beam_size=1,
+                                           best_of=1,
+                                           temperature=1.0,
                                            condition_on_previous_text=False,
                                            compression_ratio_threshold=2.4,
                                            log_prob_threshold=-1.0,
                                            no_speech_threshold=0.6)
 
-        # Combine all segments
         text = " ".join([segment.text for segment in segments]).strip()
         return text
     except Exception as e:
@@ -77,10 +87,11 @@ Reference text: {reference_text[:500]}...
         return None, f"Error: {str(e)}"
 
 # Create Gradio interface
-with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
-    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
+with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
+    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
     gr.Markdown("""
-    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
+    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
+    provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
 
     Note:
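For reference, a minimal sketch of the new initialization path run outside of Gradio, assuming `outetts` (v0.1), `faster-whisper`, and `huggingface_hub` are installed and that `InterfaceGGUF` accepts a local GGUF file path as shown in the diff:

```python
# Standalone sketch of the GGUF initialization path (not part of the commit).
from huggingface_hub import hf_hub_download
from faster_whisper import WhisperModel
from outetts.v0_1.interface import InterfaceGGUF

# Fetch the quantized model from the Hub; the file is cached locally after the first call.
model_path = hf_hub_download(
    repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
    filename="outetts-0.1-350m.gguf",
)

# GGUF-backed TTS interface, constructed the same way as in the updated initialize_models().
tts_interface = InterfaceGGUF(model_path)

# CPU-only, int8-quantized Whisper "tiny" with the same low-resource settings as the Space.
asr_model = WhisperModel("tiny",
                         device="cpu",
                         compute_type="int8",
                         num_workers=1,
                         cpu_threads=1)

print("Loaded:", type(tts_interface).__name__, type(asr_model).__name__)
```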
 
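The transcription call keeps decoding minimal for CPU speed: `beam_size=1` and `best_of=1` produce a single candidate per segment, a single `temperature` value avoids the default fallback retries, and the three thresholds match faster-whisper's documented defaults. A small usage sketch, where the input file name is a hypothetical example:

```python
# Sketch of the speed-oriented transcription settings used in transcribe_audio().
from faster_whisper import WhisperModel

asr_model = WhisperModel("tiny", device="cpu", compute_type="int8")

segments, info = asr_model.transcribe("reference.wav",               # hypothetical input file
                                      beam_size=1,                   # no beam search
                                      best_of=1,                     # single candidate
                                      temperature=1.0,               # single pass, no fallback ladder
                                      condition_on_previous_text=False,
                                      compression_ratio_threshold=2.4,
                                      log_prob_threshold=-1.0,
                                      no_speech_threshold=0.6)

# segments is a lazy generator; joining it is what actually runs the decode, as in app.py.
text = " ".join(segment.text for segment in segments).strip()
print(text)
```

Dropping `beam_size` and `best_of` to 1 trades some accuracy for the fastest possible decode on the Space's CPU-only hardware.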