File size: 7,516 Bytes
5a39a85
 
7c62735
5a39a85
 
5dbc09c
30aecac
7c62735
 
 
 
 
 
aac3370
7c62735
 
5a39a85
30aecac
5dbc09c
cc2340f
7c62735
cc2340f
 
 
 
 
 
 
7c62735
 
30aecac
 
7c62735
 
 
30aecac
 
5dbc09c
cc2340f
 
 
 
 
30aecac
5a39a85
 
 
30aecac
 
2d29569
5dbc09c
30aecac
 
5dbc09c
2d29569
cc2340f
 
 
 
2d29569
5a39a85
5dbc09c
 
cc2340f
5dbc09c
5a39a85
5dbc09c
 
cc2340f
5dbc09c
 
 
cc2340f
5dbc09c
30aecac
5dbc09c
5a39a85
 
2d29569
cc2340f
 
5a39a85
 
 
 
 
7c62735
 
7ce428c
cc2340f
7c62735
7ce428c
 
2d29569
 
cc2340f
 
 
7ce428c
 
 
 
5dbc09c
cc2340f
 
 
 
 
2d29569
 
 
30aecac
5dbc09c
2d29569
cc2340f
 
30aecac
 
cc2340f
 
2d29569
 
30aecac
5a39a85
7ce428c
cc2340f
 
 
 
 
 
 
 
 
 
 
 
 
 
7ce428c
5dbc09c
2d29569
5a39a85
7ce428c
5dbc09c
7ce428c
2d29569
cc2340f
 
 
 
 
2d29569
 
cc2340f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d29569
 
cc2340f
2d29569
 
 
7ce428c
2d29569
7ce428c
 
 
 
 
5a39a85
7ce428c
 
cc2340f
 
 
 
 
 
 
 
 
7ce428c
5a39a85
 
7ce428c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceGGUF
import soundfile as sf
import tempfile
import os
from faster_whisper import WhisperModel
import huggingface_hub

def download_model():
    """Fetch the quantized OuteTTS GGUF weights from the Hugging Face Hub.

    Returns:
        Whatever ``huggingface_hub.hf_hub_download`` returns for the requested
        file (used by the caller as the model path).
    """
    repo_id = "OuteAI/OuteTTS-0.1-350M-GGUF"
    filename = "OuteTTS-0.1-350M-Q6_K.gguf"
    return huggingface_hub.hf_hub_download(repo_id=repo_id, filename=filename)

def initialize_models():
    """Construct the text-to-speech interface and the speech-recognition model.

    Returns:
        A ``(tts_interface, asr_model)`` tuple: the OuteTTS GGUF interface
        built from the downloaded weights, and a Faster-Whisper "tiny" model
        configured for CPU int8 inference.
    """
    # Conservative llama.cpp-style settings: small context/batch, quiet logs.
    tts_settings = {
        "n_ctx": 2048,       # reduced context size
        "n_batch": 512,      # reduced batch size
        "n_threads": 4,      # adjust based on CPU
        "verbose": False,    # reduce logging
    }
    tts_interface = InterfaceGGUF(download_model(), **tts_settings)

    # Single-worker, single-thread Whisper "tiny" running on CPU with int8.
    whisper_settings = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }
    asr_model = WhisperModel("tiny", **whisper_settings)

    return tts_interface, asr_model

# Load both models once at import time so request handlers share them
# instead of reloading on every call.
try:
    TTS_INTERFACE, ASR_MODEL = initialize_models()
except Exception as init_error:
    # Report the failure, then re-raise so the app does not start without models.
    print(f"Error initializing models: {init_error!s}")
    raise

def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Clone the voice in a reference recording and speak new text with it.

    Args:
        audio_path: Path to the reference audio file; falsy when nothing was
            uploaded.
        reference_text: Transcript of the reference audio. When blank or
            ``None`` the audio is transcribed with ``transcribe_audio``.
            Only the first 2000 characters are used.
        text_to_speak: Text to synthesize. Only the first 300 characters are
            used.
        temperature: Forwarded to ``TTS_INTERFACE.generate``.
        repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)`` where
        ``message`` starts with "Error" on failure. Exceptions raised while
        processing are converted into the error message rather than propagated.
    """
    # Validate inputs before doing any model work, so the user gets a clear
    # message instead of a library traceback (or a bogus reference transcript).
    if not audio_path:
        return None, "Error: Please upload a reference audio file first."
    if not (text_to_speak or "").strip():
        return None, "Error: Please enter the text you want the cloned voice to speak."
    reference_text = reference_text or ""  # tolerate None from the UI

    try:
        # If no reference text provided, transcribe the audio
        if not reference_text.strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text
            if not reference_text.strip():
                return None, "Error: No speech could be transcribed from the reference audio."

        gr.Info(f"Using reference text: {reference_text}")

        # Limit text lengths to prevent context overflow
        reference_text = reference_text[:2000]
        text_to_speak = text_to_speak[:300]

        # Create speaker from reference audio
        speaker = TTS_INTERFACE.create_speaker(
            audio_path,
            reference_text,
        )

        # Generate speech with cloned voice
        # NOTE(review): `max_lenght` (sic) appears to be the keyword spelling
        # used by the OuteTTS v0.1 interface -- confirm before "fixing" it.
        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=1024,
        )

        # Reserve a unique .wav path, closing our handle before the library
        # writes to it (an open handle is leaked otherwise and blocks writes
        # on some platforms).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        try:
            output.save(output_path)
        except Exception:
            # Do not leave an empty temp file behind when saving fails.
            os.remove(output_path)
            raise

        return output_path, f"""Processing complete!
Reference text: {reference_text[:300]}... 
(Showing first 300 characters of reference text)"""

    except Exception as e:
        return None, f"Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    # Page header and usage notes. The emoji in the labels below were stored as
    # mis-decoded UTF-8 (mojibake) and are restored to the intended characters.
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown("""
    This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file, 
    provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.
    
    Note: 
    - For best results, use clear audio with minimal background noise
    - Reference text is limited to 2000 characters
    - Output text is limited to 300 characters
    - Short inputs work best for quality results
    """)

    with gr.Row():
        # Left column: all user inputs.
        with gr.Column():
            # Input components
            audio_input = gr.Audio(
                label="Upload Reference Audio", 
                type="filepath",
                max_length=30  # Limit audio length to 30 seconds
            )
            with gr.Row():
                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")

            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
                max_lines=5
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
                lines=3,
                max_lines=5
            )

            # Generation controls; ranges are deliberately narrow.
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=0.5,  # Reduced maximum temperature
                    value=0.1,
                    step=0.05,
                    label="Temperature (keep low for stability)"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=1.3,  # Reduced maximum
                    value=1.1,
                    step=0.05,
                    label="Repetition Penalty"
                )

            # Submit button
            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")

        # Right column: generated audio and status text.
        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)

            # Add warning about processing time
            gr.Markdown("""
            ⚠️ Note: Initial processing may take a few moments. Please be patient.
            """)
    
    # Handle transcription button
    def transcribe_audio(audio_path):
        """Transcribe an audio file with the shared Faster-Whisper model.

        Args:
            audio_path: Path to the audio file; falsy when nothing was uploaded.

        Returns:
            The transcript, truncated to 2000 characters, or a message that
            starts with "Error" when no audio was supplied or transcription
            failed. No exception is propagated.
        """
        if not audio_path:
            # Prefixed with "Error" so callers that test startswith("Error")
            # do not mistake this message for a real transcript.
            return "Error: Please upload audio first."

        try:
            # NOTE(review): temperature=1.0 with beam_size=1 samples rather than
            # decoding greedily -- confirm this is intended for reference text.
            segments, _ = ASR_MODEL.transcribe(
                audio_path,
                beam_size=1,
                best_of=1,
                temperature=1.0,
                condition_on_previous_text=False,
                compression_ratio_threshold=2.4,
                log_prob_threshold=-1.0,
                no_speech_threshold=0.6
            )

            # Strip each segment before joining: segment texts commonly carry
            # their own leading space, which would otherwise produce doubled
            # spaces. Iteration stays inside the try because consuming
            # `segments` may itself raise.
            pieces = (segment.text.strip() for segment in segments)
            text = " ".join(piece for piece in pieces if piece)
            return text[:2000]  # Limit transcription length
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"
    
    # Transcribe button: put the transcript of the uploaded audio into the
    # reference-text box.
    transcribe_btn.click(fn=transcribe_audio, inputs=[audio_input], outputs=[reference_text])

    # Generate button: run voice cloning and show the audio plus a status line.
    submit_btn.click(
        fn=process_audio_file,
        inputs=[
            audio_input,
            reference_text,
            text_to_speak,
            temperature,
            repetition_penalty,
        ],
        outputs=[
            output_audio,
            output_message,
        ],
    )

    # Static usage tips shown below the controls.
    gr.Markdown("""
    ### Tips for best results:
    1. Use clear, short audio samples (5-15 seconds is ideal)
    2. Keep both reference and output text concise
    3. Use lower temperature (0.1-0.2) for more stable output
    4. Start with short phrases to test the voice
    5. If generation fails, try:
       - Using shorter text
       - Reducing temperature
       - Using clearer audio
       - Simplifying the text
    """)

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()