Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Community

Chillarmo committed on Nov 5

Commit

5dbc09c

•

1 Parent(s): 776e91e

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -144

app.py CHANGED Viewed

@@ -1,198 +1,114 @@
 import gradio as gr
 import torch
-import os
 from outetts.v0_1.interface import InterfaceHF
 import soundfile as sf
 import tempfile
 from faster_whisper import WhisperModel
-from pathlib import Path
-# Configure PyTorch for CPU efficiency
-torch.set_num_threads(4)  # Limit CPU threads
-torch.set_grad_enabled(False)  # Disable gradient computation
-class OptimizedTTSInterface:
-    def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
-        self.interface = InterfaceHF(model_name)
-        # Apply FP16 optimization where possible
-        self.interface.model = self.interface.model.half().float()
-        # Cache commonly used attributes
-        self.tokenizer = self.interface.model.tokenizer
-    def create_speaker(self, *args, **kwargs):
-        with torch.inference_mode():
-            return self.interface.create_speaker(*args, **kwargs)
-    def generate(self, *args, **kwargs):
-        with torch.inference_mode():
-            return self.interface.generate(*args, **kwargs)
 def initialize_models():
-    """Initialize the OptimizedTTS and Faster-Whisper models"""
-    # Create cache directory for models
-    cache_dir = Path("model_cache")
-    cache_dir.mkdir(exist_ok=True)
-    # Set environment variables for better performance
-    os.environ['OMP_NUM_THREADS'] = '4'
-    os.environ['MKL_NUM_THREADS'] = '4'
-    print("Loading ASR model...")
     asr_model = WhisperModel("tiny",
                             device="cpu",
-                            compute_type="int8",
-                            num_workers=1,
-                            cpu_threads=2,
-                            download_root=str(cache_dir))
-    print("Loading TTS model...")
-    tts_interface = OptimizedTTSInterface()
     return tts_interface, asr_model
 def transcribe_audio(audio_path):
     """Transcribe audio using Faster-Whisper tiny"""
     try:
-        segments, _ = ASR_MODEL.transcribe(audio_path,
-                                         beam_size=1,
-                                         best_of=1,
-                                         temperature=1.0,
-                                         condition_on_previous_text=False,
-                                         compression_ratio_threshold=2.4,
-                                         log_prob_threshold=-1.0,
-                                         no_speech_threshold=0.6)
         text = " ".join([segment.text for segment in segments]).strip()
         return text
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
-def preprocess_audio(audio_path):
-    """Preprocess audio to reduce memory usage"""
-    try:
-        # Load and resample audio to 16kHz if needed
-        data, sr = sf.read(audio_path)
-        if sr != 16000:
-            import resampy
-            data = resampy.resample(data, sr, 16000)
-            sr = 16000
-        # Convert to mono if stereo
-        if len(data.shape) > 1:
-            data = data.mean(axis=1)
-        # Normalize audio
-        data = data / max(abs(data.max()), abs(data.min()))
-        # Save preprocessed audio
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        sf.write(temp_file.name, data, sr)
-        return temp_file.name
-    except Exception as e:
-        return audio_path  # Return original if preprocessing fails
 def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
     """Process the audio file and generate speech with the cloned voice"""
     try:
-        # Preprocess audio
-        processed_audio = preprocess_audio(audio_path)
         # If no reference text provided, transcribe the audio
         if not reference_text.strip():
-            reference_text = transcribe_audio(processed_audio)
             if reference_text.startswith("Error"):
                 return None, reference_text
         # Create speaker from reference audio
-        with torch.inference_mode():
-            speaker = TTS_INTERFACE.create_speaker(
-                processed_audio,
-                reference_text
-            )
-            # Generate speech with cloned voice
-            output = TTS_INTERFACE.generate(
-                text=text_to_speak,
-                speaker=speaker,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                max_lenght=4096
-            )
-        # Clean up preprocessed audio if it was created
-        if processed_audio != audio_path:
-            try:
-                os.unlink(processed_audio)
-            except:
-                pass
-        # Save output to temporary file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         output.save(temp_file.name)
         return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"
     except Exception as e:
-        if processed_audio != audio_path:
-            try:
-                os.unlink(processed_audio)
-            except:
-                pass
         return None, f"Error: {str(e)}"
-print("Starting initialization...")
-# Initialize models globally
-TTS_INTERFACE, ASR_MODEL = initialize_models()
-print("Models initialized successfully!")
 # Create Gradio interface
 with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
-    gr.Markdown("# 🎙️ Optimized Voice Cloning with OuteTTS")
     gr.Markdown("""
-    This app uses optimized versions of OuteTTS and Whisper for efficient voice cloning on CPU.
-    Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
-    Note: First run may take longer while models are being cached.
     """)
     with gr.Row():
         with gr.Column():
-            audio_input = gr.Audio(
-                label="Upload Reference Audio",
-                type="filepath",
-                source="microphone"
-            )
             reference_text = gr.Textbox(
-                label="Reference Text (leave blank for auto-transcription)",
                 placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
             )
             text_to_speak = gr.Textbox(
-                label="Text to Speak",
                 placeholder="Enter the text you want the cloned voice to speak"
             )
             with gr.Row():
-                temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.1,
-                    step=0.1,
-                    label="Temperature"
-                )
-                repetition_penalty = gr.Slider(
-                    minimum=1.0,
-                    maximum=2.0,
-                    value=1.1,
-                    step=0.1,
-                    label="Repetition Penalty"
-                )
             submit_btn = gr.Button("Generate Voice", variant="primary")
         with gr.Column():
             output_audio = gr.Audio(label="Generated Speech")
             output_message = gr.Textbox(label="Status", max_lines=3)
     submit_btn.click(
         fn=process_audio_file,
         inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
@@ -200,18 +116,14 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
     )
     gr.Markdown("""
-    ### Optimization Notes:
-    - Optimized for CPU performance
-    - Model caching enabled
-    - Memory-efficient inference
-    - Automatic audio preprocessing
     ### Tips for best results:
-    1. Use clear, high-quality reference audio
-    2. Keep reference audio short (5-10 seconds)
-    3. Verify auto-transcription accuracy
-    4. For best quality, manually input exact reference text
-    5. Keep generated text concise
     """)
 if __name__ == "__main__":

 import gradio as gr
 import torch
 from outetts.v0_1.interface import InterfaceHF
 import soundfile as sf
 import tempfile
+import os
 from faster_whisper import WhisperModel
 def initialize_models():
     """Build the two models the app relies on.

     Returns:
         A ``(tts_interface, asr_model)`` tuple: the OuteTTS interface built
         from the "OuteAI/OuteTTS-0.1-350M" model name, and a Faster-Whisper
         "tiny" model configured to run on CPU.
     """
     tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
     # Keep the speech recogniser as light as possible: int8 compute type,
     # one worker and one CPU thread, for a low-resource environment.
     whisper_options = {
         "device": "cpu",
         "compute_type": "int8",
         "num_workers": 1,
         "cpu_threads": 1,
     }
     asr_model = WhisperModel("tiny", **whisper_options)
     return tts_interface, asr_model


 # Load both models once, when the module is imported, so the request
 # handlers below reuse them instead of reloading on every call.
 TTS_INTERFACE, ASR_MODEL = initialize_models()
 def transcribe_audio(audio_path):
     """Transcribe an audio file with the module-level Faster-Whisper model.

     Args:
         audio_path: Path of the audio file; passed unchanged to
             ``ASR_MODEL.transcribe``.

     Returns:
         The text of every segment joined with single spaces, with leading
         and trailing whitespace stripped. This function never raises: on
         any failure it returns a string that starts with
         "Error transcribing audio: " followed by the exception message.
     """
     try:
         segments, _ = ASR_MODEL.transcribe(
             audio_path,
             beam_size=1,  # no beam search
             best_of=1,  # do not generate alternative candidates
             # 0.0 requests deterministic (non-sampled) decoding. The value
             # used before, 1.0, enables sampling, which contradicted the
             # accompanying "no temperature sampling" comment and made the
             # reference transcript vary from run to run.
             temperature=0.0,
             condition_on_previous_text=False,  # treat each window independently
             # NOTE(review): these three values appear to be the library
             # defaults rather than relaxed thresholds - confirm against the
             # installed faster-whisper version.
             compression_ratio_threshold=2.4,
             log_prob_threshold=-1.0,
             no_speech_threshold=0.6,
         )
         # Iterate the segments inside the try block so that errors raised
         # while reading them are also reported as an error string.
         return " ".join(segment.text for segment in segments).strip()
     except Exception as e:
         return f"Error transcribing audio: {e}"
 def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
     """Clone the voice in a reference recording and speak new text with it.

     Args:
         audio_path: Path of the reference audio file, or None/empty when no
             audio was supplied.
         reference_text: Transcript of the reference audio. When blank or
             None it is obtained by calling ``transcribe_audio``.
         text_to_speak: Text the cloned voice should say.
         temperature: Forwarded to ``TTS_INTERFACE.generate``.
         repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

     Returns:
         A ``(output_path, status_message)`` tuple. On success
         ``output_path`` names a ".wav" temporary file holding the generated
         speech (created with ``delete=False``, so it is not removed
         automatically). On any failure the tuple is ``(None, message)``
         with a message starting with "Error"; this function never raises.
     """
     # Reject unusable input up front with a clear message instead of
     # letting it surface as an opaque library exception further down.
     if not audio_path:
         return None, "Error: Please provide a reference audio file."
     if not (text_to_speak or "").strip():
         return None, "Error: Please enter the text to speak."
     try:
         # If no reference text provided, transcribe the audio
         if not (reference_text or "").strip():
             reference_text = transcribe_audio(audio_path)
             # transcribe_audio reports failure through this exact prefix.
             # Matching the full prefix (not just "Error") keeps a genuine
             # transcript that happens to begin with "Error" usable.
             if reference_text.startswith("Error transcribing audio:"):
                 return None, reference_text
             if not reference_text:
                 return None, "Error: No speech could be transcribed from the reference audio. Please enter the reference text manually."
         # Create speaker from reference audio
         speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)
         # Generate speech with cloned voice
         output = TTS_INTERFACE.generate(
             text=text_to_speak,
             speaker=speaker,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
             # NOTE(review): "max_lenght" (sic) is kept exactly as written;
             # presumably this is the keyword the outetts v0.1 API expects -
             # confirm before correcting the spelling.
             max_lenght=4096,
         )
         # Reserve a unique path, then close the handle before saving so no
         # file descriptor is leaked and the path can be reopened for writing.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
             output_path = temp_file.name
         output.save(output_path)
         return output_path, f"Voice cloning successful!\nReference text used: {reference_text}"
     except Exception as e:
         return None, f"Error: {e}"
 # Create Gradio interface
 with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
+    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
     gr.Markdown("""
+    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
+    Note: For best results, use clear audio with minimal background noise.
     """)
     with gr.Row():
         with gr.Column():
+            # Input components
+            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
             reference_text = gr.Textbox(
+                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                 placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
             )
             text_to_speak = gr.Textbox(
+                label="Text to Speak (what you want the cloned voice to say)",
                 placeholder="Enter the text you want the cloned voice to speak"
             )
             with gr.Row():
+                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
+                                     label="Temperature (higher = more variation)")
+                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
+                                             label="Repetition Penalty")
+            # Submit button
             submit_btn = gr.Button("Generate Voice", variant="primary")
         with gr.Column():
+            # Output components
             output_audio = gr.Audio(label="Generated Speech")
             output_message = gr.Textbox(label="Status", max_lines=3)
+    # Handle submission
     submit_btn.click(
         fn=process_audio_file,
         inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
     )
     gr.Markdown("""
     ### Tips for best results:
+    1. Use high-quality reference audio (clear speech, minimal background noise)
+    2. If providing reference text manually, ensure it matches the audio exactly
+    3. If using auto-transcription, verify the transcribed text in the status message
+    4. Keep generated text relatively short for better quality
+    5. Adjust temperature and repetition penalty if needed:
+       - Lower temperature (0.1-0.3) for more consistent output
+       - Higher repetition penalty (1.1-1.3) to avoid repetition
     """)
 if __name__ == "__main__":