Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Community

Chillarmo committed on Nov 5

Commit

cc2340f

•

1 Parent(s): 71c72c2

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -49

app.py CHANGED Viewed

@@ -11,15 +11,21 @@ def download_model():
     """Download the GGUF model from HuggingFace"""
     model_path = huggingface_hub.hf_hub_download(
         repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
-        filename="OuteTTS-0.1-350M-Q6_K.gguf"
     )
     return model_path
 def initialize_models():
     """Initialize the OuteTTS and Faster-Whisper models"""
-    # Download and initialize GGUF model
     model_path = download_model()
-    tts_interface = InterfaceGGUF(model_path)
     # Initialize Whisper
     asr_model = WhisperModel("tiny",
@@ -30,24 +36,11 @@ def initialize_models():
     return tts_interface, asr_model
 # Initialize models globally to avoid reloading
-TTS_INTERFACE, ASR_MODEL = initialize_models()
-def transcribe_audio(audio_path):
-    """Transcribe audio using Faster-Whisper tiny"""
-    try:
-        segments, _ = ASR_MODEL.transcribe(audio_path,
-                                         beam_size=1,
-                                         best_of=1,
-                                         temperature=1.0,
-                                         condition_on_previous_text=False,
-                                         compression_ratio_threshold=2.4,
-                                         log_prob_threshold=-1.0,
-                                         no_speech_threshold=0.6)
-        text = " ".join([segment.text for segment in segments]).strip()
-        return text
-    except Exception as e:
-        return f"Error transcribing audio: {str(e)}"
 def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
     """Process the audio file and generate speech with the cloned voice"""
@@ -60,28 +53,32 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
                 return None, reference_text
         gr.Info(f"Using reference text: {reference_text}")
         # Create speaker from reference audio
         speaker = TTS_INTERFACE.create_speaker(
             audio_path,
-            reference_text[:4000]  # Limit reference text length
         )
         # Generate speech with cloned voice
         output = TTS_INTERFACE.generate(
-            text=text_to_speak[:500],  # Limit output text length
             speaker=speaker,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
-            max_lenght=2048  # Reduced from 4096 to avoid errors
         )
         # Save to temporary file and return path
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         output.save(temp_file.name)
         return temp_file.name, f"""Processing complete!
-Reference text: {reference_text[:500]}...
-(Showing first 500 characters of reference text)"""
     except Exception as e:
         return None, f"Error: {str(e)}"
@@ -90,40 +87,56 @@ Reference text: {reference_text[:500]}...
 with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
     gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
     gr.Markdown("""
-    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
     provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
     Note:
     - For best results, use clear audio with minimal background noise
-    - Reference text is limited to 4000 characters
-    - Output text is limited to 500 characters
     """)
     with gr.Row():
         with gr.Column():
             # Input components
-            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
             with gr.Row():
                 transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
             reference_text = gr.Textbox(
                 label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                 placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
-                lines=3
             )
             text_to_speak = gr.Textbox(
-                label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
-                placeholder="Enter the text you want the cloned voice to speak",
                 lines=3,
                 max_lines=5
             )
             with gr.Row():
-                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
-                                     label="Temperature (higher = more variation)")
-                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
-                                             label="Repetition Penalty")
             # Submit button
             submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
@@ -132,15 +145,37 @@ with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
             # Output components
             output_audio = gr.Audio(label="Generated Speech")
             output_message = gr.Textbox(label="Status", lines=4)
     # Handle transcription button
-    def transcribe_button(audio):
-        if not audio:
-            return "Please upload audio first."
-        return transcribe_audio(audio)
     transcribe_btn.click(
-        fn=transcribe_button,
         inputs=[audio_input],
         outputs=[reference_text],
     )
@@ -154,13 +189,15 @@ with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
     gr.Markdown("""
     ### Tips for best results:
-    1. Use high-quality reference audio (clear speech, minimal background noise)
-    2. Try to keep reference audio under 30 seconds
-    3. If auto-transcription isn't accurate, you can manually correct the text
-    4. Keep generated text short for better quality
-    5. Adjust temperature and repetition penalty if needed:
-       - Lower temperature (0.1-0.3) for more consistent output
-       - Higher repetition penalty (1.1-1.3) to avoid repetition
     """)
 if __name__ == "__main__":

     """Download the GGUF model from HuggingFace"""
     model_path = huggingface_hub.hf_hub_download(
         repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
+        filename="outetts-0.1-350m.gguf"
     )
     return model_path
 def initialize_models():
     """Initialize the OuteTTS and Faster-Whisper models"""
+    # Download and initialize GGUF model with adjusted parameters
     model_path = download_model()
+    tts_interface = InterfaceGGUF(
+        model_path,
+        n_ctx=2048,          # Reduced context size
+        n_batch=512,         # Reduced batch size
+        n_threads=4,         # Adjust based on CPU
+        verbose=False,       # Reduce logging
+    )
     # Initialize Whisper
     asr_model = WhisperModel("tiny",
     return tts_interface, asr_model
 # Initialize models globally to avoid reloading
+try:
+    TTS_INTERFACE, ASR_MODEL = initialize_models()
+except Exception as e:
+    print(f"Error initializing models: {str(e)}")
+    raise
 def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
     """Process the audio file and generate speech with the cloned voice"""
                 return None, reference_text
         gr.Info(f"Using reference text: {reference_text}")
+        # Limit text lengths to prevent context overflow
+        reference_text = reference_text[:2000]  # Further reduced
+        text_to_speak = text_to_speak[:300]     # Further reduced
         # Create speaker from reference audio
         speaker = TTS_INTERFACE.create_speaker(
             audio_path,
+            reference_text,
         )
         # Generate speech with cloned voice
         output = TTS_INTERFACE.generate(
+            text=text_to_speak,
             speaker=speaker,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
+            max_lenght=1024  # Reduced from 2048
         )
         # Save to temporary file and return path
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         output.save(temp_file.name)
         return temp_file.name, f"""Processing complete!
+Reference text: {reference_text[:300]}...
+(Showing first 300 characters of reference text)"""
     except Exception as e:
         return None, f"Error: {str(e)}"
 with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
     gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
     gr.Markdown("""
+    This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,
     provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
     Note:
     - For best results, use clear audio with minimal background noise
+    - Reference text is limited to 2000 characters
+    - Output text is limited to 300 characters
+    - Short inputs work best for quality results
     """)
     with gr.Row():
         with gr.Column():
             # Input components
+            audio_input = gr.Audio(
+                label="Upload Reference Audio",
+                type="filepath",
+                max_length=30  # Limit audio length to 30 seconds
+            )
             with gr.Row():
                 transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
             reference_text = gr.Textbox(
                 label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                 placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
+                lines=3,
+                max_lines=5
             )
             text_to_speak = gr.Textbox(
+                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
+                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
                 lines=3,
                 max_lines=5
             )
             with gr.Row():
+                temperature = gr.Slider(
+                    minimum=0.1,
+                    maximum=0.5,  # Reduced maximum temperature
+                    value=0.1,
+                    step=0.05,
+                    label="Temperature (keep low for stability)"
+                )
+                repetition_penalty = gr.Slider(
+                    minimum=1.0,
+                    maximum=1.3,  # Reduced maximum
+                    value=1.1,
+                    step=0.05,
+                    label="Repetition Penalty"
+                )
             # Submit button
             submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
             # Output components
             output_audio = gr.Audio(label="Generated Speech")
             output_message = gr.Textbox(label="Status", lines=4)
+            # Add warning about processing time
+            gr.Markdown("""
+            ⚠️ Note: Initial processing may take a few moments. Please be patient.
+            """)
     # Handle transcription button
+    def transcribe_audio(audio_path):
+        """Transcribe audio using Faster-Whisper tiny"""
+        try:
+            if not audio_path:
+                return "Please upload audio first."
+            segments, _ = ASR_MODEL.transcribe(
+                audio_path,
+                beam_size=1,
+                best_of=1,
+                temperature=1.0,
+                condition_on_previous_text=False,
+                compression_ratio_threshold=2.4,
+                log_prob_threshold=-1.0,
+                no_speech_threshold=0.6
+            )
+            text = " ".join([segment.text for segment in segments]).strip()
+            return text[:2000]  # Limit transcription length
+        except Exception as e:
+            return f"Error transcribing audio: {str(e)}"
     transcribe_btn.click(
+        fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[reference_text],
     )
     gr.Markdown("""
     ### Tips for best results:
+    1. Use clear, short audio samples (5-15 seconds is ideal)
+    2. Keep both reference and output text concise
+    3. Use lower temperature (0.1-0.2) for more stable output
+    4. Start with short phrases to test the voice
+    5. If generation fails, try:
+       - Using shorter text
+       - Reducing temperature
+       - Using clearer audio
+       - Simplifying the text
     """)
 if __name__ == "__main__":