File size: 5,846 Bytes
5a39a85
 
 
 
 
5dbc09c
30aecac
5a39a85
30aecac
5dbc09c
 
 
30aecac
 
5dbc09c
 
 
30aecac
 
5dbc09c
 
 
30aecac
 
 
5dbc09c
 
 
 
 
 
 
 
 
30aecac
5dbc09c
30aecac
 
 
 
 
5a39a85
 
 
30aecac
 
5dbc09c
30aecac
 
5dbc09c
5a39a85
5dbc09c
 
 
 
5a39a85
5dbc09c
 
 
 
 
 
 
 
30aecac
5dbc09c
5a39a85
 
30aecac
5a39a85
 
 
 
 
7ce428c
5dbc09c
7ce428c
5dbc09c
7ce428c
 
5dbc09c
7ce428c
 
 
 
5dbc09c
 
30aecac
5dbc09c
30aecac
 
 
5dbc09c
30aecac
 
5a39a85
7ce428c
5dbc09c
 
 
 
7ce428c
5dbc09c
7ce428c
5a39a85
7ce428c
5dbc09c
7ce428c
30aecac
7ce428c
5dbc09c
7ce428c
 
 
 
 
5a39a85
7ce428c
 
5dbc09c
 
 
 
 
 
 
7ce428c
5a39a85
 
7ce428c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceHF
import soundfile as sf
import tempfile
import os
from faster_whisper import WhisperModel

def initialize_models():
    """Load the OuteTTS synthesis interface and the Faster-Whisper ASR model.

    Returns:
        tuple: (tts_interface, asr_model) ready for inference.
    """
    tts = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    # Tiny Whisper with int8 quantization on CPU, one worker and one thread:
    # tuned for the smallest possible footprint rather than accuracy.
    whisper = WhisperModel(
        "tiny",
        device="cpu",
        compute_type="int8",
        num_workers=1,
        cpu_threads=1,
    )
    return tts, whisper

# Initialize models globally to avoid reloading on every request.
# NOTE: this runs at import time, so importing this module downloads/loads
# both models before the UI is even constructed.
TTS_INTERFACE, ASR_MODEL = initialize_models()

def transcribe_audio(audio_path):
    """Transcribe an audio file with the Faster-Whisper "tiny" model.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        str: The transcription, or an "Error transcribing audio: ..."
        message on failure (callers detect failure via the "Error" prefix).
    """
    try:
        # Minimal decoding settings for speed on a low-resource CPU box.
        # Fix: temperature=0.0 gives deterministic greedy decoding. The
        # previous value of 1.0 enabled full random sampling, which
        # contradicted the stated intent ("no temperature sampling") and
        # made transcriptions nondeterministic and lower quality.
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,                       # single beam for speed
            best_of=1,                         # no alternative candidates
            temperature=0.0,                   # deterministic greedy decoding
            condition_on_previous_text=False,  # each segment decoded independently
            compression_ratio_threshold=2.4,   # relaxed gibberish filter
            log_prob_threshold=-1.0,           # relaxed confidence filter
            no_speech_threshold=0.6,           # relaxed silence filter
        )

        # Stitch the segment texts back into a single string.
        return " ".join(segment.text for segment in segments).strip()
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Clone the voice in *audio_path* and speak *text_to_speak* with it.

    Args:
        audio_path: Path to the reference audio sample.
        reference_text: Transcript of the reference audio; when blank it is
            auto-transcribed via transcribe_audio().
        text_to_speak: Text to synthesize in the cloned voice.
        temperature: Sampling temperature for generation.
        repetition_penalty: Penalty discouraging repeated tokens.

    Returns:
        tuple: (path to the generated wav file or None, status message).
    """
    try:
        # Fall back to automatic transcription when no transcript was given.
        if not reference_text.strip():
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text

        # Build a speaker profile from the reference clip and its transcript.
        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)

        # Synthesize the requested text in the cloned voice.
        # NOTE(review): "max_lenght" is presumably the spelling the OuteTTS
        # v0.1 API expects — confirm against the library before renaming.
        generated = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=4096,
        )

        # Persist to a temp wav; delete=False so Gradio can serve it later.
        wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        generated.save(wav_file.name)
        status = f"Voice cloning successful!\nReference text used: {reference_text}"
        return wav_file.name, status

    except Exception as e:
        return None, f"Error: {str(e)}"

# Create Gradio interface.
# Layout: a two-column Blocks page — inputs (audio, transcripts, sliders,
# submit button) on the left, generated audio and a status box on the right.
with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    gr.Markdown("# ๐ŸŽ™๏ธ Voice Cloning with OuteTTS")
    gr.Markdown("""
    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.
    
    Note: For best results, use clear audio with minimal background noise.
    """)
    
    with gr.Row():
        with gr.Column():
            # Input components.
            # type="filepath" so the handler receives a path usable by both
            # the ASR model and TTS_INTERFACE.create_speaker().
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say)",
                placeholder="Enter the text you want the cloned voice to speak"
            )
            
            # Generation knobs; defaults match process_audio_file's defaults.
            with gr.Row():
                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, 
                                     label="Temperature (higher = more variation)")
                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                                             label="Repetition Penalty")
            
            # Submit button
            submit_btn = gr.Button("Generate Voice", variant="primary")
        
        with gr.Column():
            # Output components: the synthesized wav plus a status/error line.
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", max_lines=3)
    
    # Handle submission: wire all five inputs into process_audio_file and
    # route its (audio_path, message) return into the two outputs.
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message]
    )
    
    gr.Markdown("""
    ### Tips for best results:
    1. Use high-quality reference audio (clear speech, minimal background noise)
    2. If providing reference text manually, ensure it matches the audio exactly
    3. If using auto-transcription, verify the transcribed text in the status message
    4. Keep generated text relatively short for better quality
    5. Adjust temperature and repetition penalty if needed:
       - Lower temperature (0.1-0.3) for more consistent output
       - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)

# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()