import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceGGUF
import soundfile as sf
import tempfile
import os
from faster_whisper import WhisperModel
import huggingface_hub
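# Environment note (a sketch, assuming the PyPI package names match the imports):
#   pip install gradio torch outetts soundfile faster-whisper huggingface_hub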

def download_model():
    """Download the GGUF model from HuggingFace"""
    model_path = huggingface_hub.hf_hub_download(
        repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
        filename="OuteTTS-0.1-350M-Q6_K.gguf"
    )
    return model_path

def initialize_models():
    """Initialize the OuteTTS and Faster-Whisper models"""
    # Download and initialize GGUF model
    model_path = download_model()
    tts_interface = InterfaceGGUF(model_path)
    
    # Initialize Whisper
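    # The "tiny" checkpoint with int8 compute keeps CPU and memory usage low,
    # trading some transcription accuracy for speed.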
    asr_model = WhisperModel("tiny", 
                            device="cpu",
                            compute_type="int8",
                            num_workers=1,
                            cpu_threads=1)
    return tts_interface, asr_model

# Initialize models globally to avoid reloading
TTS_INTERFACE, ASR_MODEL = initialize_models()

def transcribe_audio(audio_path):
    """Transcribe audio using Faster-Whisper tiny"""
    try:
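        # beam_size=1 / best_of=1 means plain greedy decoding (fastest on CPU);
        # the threshold values below match faster-whisper's defaults.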
        segments, _ = ASR_MODEL.transcribe(audio_path, 
                                         beam_size=1,
                                         best_of=1,
                                         temperature=1.0,
                                         condition_on_previous_text=False,
                                         compression_ratio_threshold=2.4,
                                         log_prob_threshold=-1.0,
                                         no_speech_threshold=0.6)
        
        text = " ".join([segment.text for segment in segments]).strip()
        return text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Process the audio file and generate speech with the cloned voice"""
    try:
        # If no reference text provided, transcribe the audio
        if not reference_text.strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text
            
        gr.Info(f"Using reference text: {reference_text}")
            
        # Create speaker from reference audio
        speaker = TTS_INTERFACE.create_speaker(
            audio_path,
            reference_text[:4000]  # Limit reference text length
        )
        
        # Generate speech with cloned voice
        output = TTS_INTERFACE.generate(
            text=text_to_speak[:500],  # Limit output text length
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=2048  # Reduced from 4096 to avoid errors; "max_lenght" is the parameter's spelling in the outetts v0.1 API
        )
        
        # Save to temporary file and return path
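        # delete=False keeps the file on disk after this handler returns,
        # so Gradio can serve it back to the browser by path.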
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output.save(temp_file.name)
        return temp_file.name, f"""Processing complete!
Reference text: {reference_text[:500]}... 
(Showing first 500 characters of reference text)"""
        
    except Exception as e:
        return None, f"Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# πŸŽ™οΈ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown("""
    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file, 
    provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.
    
    Note: 
    - For best results, use clear audio with minimal background noise
    - Reference text is limited to 4000 characters
    - Output text is limited to 500 characters
    """)
    
    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            with gr.Row():
                transcribe_btn = gr.Button("πŸ“ Transcribe Audio", variant="secondary")
                
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
                placeholder="Enter the text you want the cloned voice to speak",
                lines=3,
                max_lines=5
            )
            
            with gr.Row():
                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, 
                                     label="Temperature (higher = more variation)")
                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                                             label="Repetition Penalty")
            
            # Submit button
            submit_btn = gr.Button("πŸŽ™οΈ Generate Voice", variant="primary")
        
        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)
    
    # Handle transcription button
    def transcribe_button(audio):
        if not audio:
            return "Please upload audio first."
        return transcribe_audio(audio)
    
    transcribe_btn.click(
        fn=transcribe_button,
        inputs=[audio_input],
        outputs=[reference_text],
    )
    
    # Handle main generation
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message]
    )
    
    gr.Markdown("""
    ### Tips for best results:
    1. Use high-quality reference audio (clear speech, minimal background noise)
    2. Try to keep reference audio under 30 seconds
    3. If auto-transcription isn't accurate, you can manually correct the text
    4. Keep generated text short for better quality
    5. Adjust temperature and repetition penalty if needed:
       - Lower temperature (0.1-0.3) for more consistent output
       - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)

if __name__ == "__main__":
    demo.launch()
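    # demo.launch(share=True) would additionally expose a temporary public URL.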