File size: 8,976 Bytes
e44094d
 
4e4528b
153c25e
a2dc963
 
 
 
e44094d
4e4528b
 
 
e44094d
4e4528b
 
 
 
 
 
 
 
6ca77a8
a2dc963
 
 
 
 
 
 
 
 
153c25e
4e4528b
 
153c25e
4e4528b
 
 
 
 
153c25e
 
4e4528b
 
 
 
153c25e
4e4528b
 
153c25e
4e4528b
 
 
 
153c25e
 
4e4528b
 
 
 
a2dc963
4e4528b
 
 
 
 
 
 
6ca77a8
a2dc963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153c25e
a2dc963
153c25e
 
a2dc963
153c25e
 
 
 
a2dc963
153c25e
 
a2dc963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153c25e
a2dc963
 
 
 
 
153c25e
 
 
 
 
4e4528b
6ca77a8
4e4528b
 
 
 
153c25e
4e4528b
 
a2dc963
4e4528b
 
 
6ca77a8
153c25e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2dc963
153c25e
 
 
 
 
 
a2dc963
153c25e
a2dc963
 
 
 
 
 
153c25e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e4528b
153c25e
 
 
a2dc963
 
153c25e
4e4528b
153c25e
 
 
 
 
 
4e4528b
 
 
 
 
 
153c25e
4e4528b
 
 
a2dc963
4e4528b
 
e44094d
4e4528b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import gradio as gr
from outetts.v0_1.interface import InterfaceHF
import logging
import os
import tempfile

# Import faster-whisper for transcription
from faster_whisper import WhisperModel

# Configure logging to display information in the terminal
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the OuteTTS interface with the Hugging Face model.
# Loading happens at import time; a failure is logged with its traceback and
# re-raised unchanged so the app does not start without a model.
try:
    logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'")
    interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    logger.info("Model loaded successfully.")
except Exception as e:
    logger.exception(f"Failed to load model: {e}")
    raise  # bare raise keeps the original traceback intact

# Initialize the faster-whisper model used to transcribe reference audio.
try:
    logger.info("Initializing faster-whisper model for transcription.")
    whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
    logger.info("faster-whisper model loaded successfully.")
except Exception as e:
    logger.exception(f"Failed to load faster-whisper model: {e}")
    raise  # bare raise keeps the original traceback intact

def generate_tts(text, temperature, repetition_penalty, max_length, speaker=None):
    """
    Generates speech from the input text using the OuteTTS model.

    Parameters:
        text (str): The input text for TTS.
        temperature (float): Sampling temperature.
        repetition_penalty (float): Repetition penalty.
        max_length (int): Maximum length of the generated audio tokens.
        speaker (dict, optional): Speaker configuration for voice cloning.
            Defaults to None, which is forwarded to the model as-is.

    Returns:
        str | None: Path to the generated WAV file, or None when the text is
        empty or generation fails (the error is logged, not raised).
    """
    logger.info("Received TTS generation request.")
    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}")

    # Nothing to synthesize; avoid calling the model with blank input.
    if not text or not text.strip():
        logger.error("TTS generation skipped: input text is empty.")
        return None

    output_path = None
    try:
        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
        output = interface.generate(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # Cast defensively in case the UI slider delivers a float.
            max_lenght=int(max_length),  # Pass the parameter with typo
            speaker=speaker
        )
        logger.info("TTS generation complete.")

        # Use a unique file per request: a fixed name in the shared temp
        # directory would let concurrent requests overwrite each other.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # only the path is needed; output.save() opens it itself
        output.save(output_path)
        logger.info(f"Audio saved to {output_path}")

        return output_path  # Gradio will handle the audio playback
    except Exception as e:
        logger.exception(f"Error during TTS generation: {e}")
        # Do not leave a half-written file behind when saving failed.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None

def transcribe_audio(audio_path):
    """
    Transcribes the given audio file using faster-whisper.

    Parameters:
        audio_path (str): Path to the audio file.

    Returns:
        str: Transcribed text, with segment texts stripped and joined by a
        single space (empty string when no text was recognized).
    """
    logger.info(f"Transcribing audio file: {audio_path}")
    # The second return value (transcription info) is not used here.
    segments, _ = whisper_model.transcribe(audio_path)
    # Strip each segment before joining so that whitespace carried by the
    # individual segment texts does not produce doubled or leading spaces.
    pieces = (segment.text.strip() for segment in segments)
    transcript = " ".join(piece for piece in pieces if piece)
    logger.info(f"Transcription complete: {transcript}")
    return transcript

def create_speaker_with_transcription(audio_file):
    """
    Creates a custom speaker from a reference audio file by automatically transcribing it.

    Parameters:
        audio_file (file | str | os.PathLike): Uploaded reference audio, either
            as a binary file-like object exposing ``read()`` or as a path to
            an audio file already on disk.

    Returns:
        dict | None: Speaker configuration, or None when no audio was given,
        the transcription is empty, or any step fails (errors are logged).
    """
    logger.info("Received Voice Cloning request with audio file.")

    if audio_file is None:
        logger.error("No reference audio provided.")
        return None

    # Set only when this function creates a temporary copy it must delete.
    temp_audio_path = None
    try:
        if isinstance(audio_file, (str, os.PathLike)):
            # Already a file on disk: use it in place and never delete it.
            audio_path = os.fspath(audio_file)
        else:
            # Write through the handle we already hold; re-opening the same
            # path while it is still open fails on some platforms.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                temp_audio_path = temp_audio.name
                temp_audio.write(audio_file.read())
            audio_path = temp_audio_path
            logger.info(f"Reference audio saved to {temp_audio_path}")

        # Transcribe the audio file
        transcript = transcribe_audio(audio_path)

        if not transcript.strip():
            logger.error("Transcription resulted in empty text.")
            return None

        # Create speaker using the transcribed text
        speaker = interface.create_speaker(audio_path, transcript)
        logger.info("Speaker created successfully.")

        return speaker
    except Exception as e:
        logger.exception(f"Error during speaker creation: {e}")
        return None
    finally:
        # Clean up on every exit path (success, empty transcript, exception).
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
                logger.info(f"Temporary audio file {temp_audio_path} removed.")
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary audio file {temp_audio_path}: {cleanup_error}")

# Define the Gradio Blocks interface.
# Two tabs share the same generation function: "Basic TTS" uses the model's
# default voice, "Voice Cloning" first builds a speaker from reference audio.
with gr.Blocks() as demo:
    gr.Markdown("# 🎀 OuteTTS - Text to Speech Interface")
    gr.Markdown(
        """
        Generate speech from text using the **OuteTTS-0.1-350M** model.

        **Key Features:**
        - Pure language modeling approach to TTS
        - Voice cloning capabilities with automatic transcription
        - Compatible with LLaMa architecture
        """
    )

    with gr.Tab("Basic TTS"):
        with gr.Row():
            text_input = gr.Textbox(
                label="📄 Text Input",
                placeholder="Enter the text for TTS generation",
                lines=3
            )

        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                step=0.01,
                label="🌑️ Temperature"
            )
            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="🔁 Repetition Penalty"
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=4096,
                value=1024,
                step=256,
                label="📏 Max Length"
            )

        generate_button = gr.Button("🔊 Generate Speech")

        output_audio = gr.Audio(
            label="🎧 Generated Speech",
            type="filepath"  # Expecting a file path to the audio
        )

        # Define the button click event for Basic TTS.
        # Every entry of `inputs` must be a Gradio component, so the "no
        # speaker" case is supplied by a small wrapper instead of a literal
        # None in the inputs list.
        generate_button.click(
            fn=lambda text, temp, penalty, length: generate_tts(text, temp, penalty, length, None),
            inputs=[text_input, temperature, repetition_penalty, max_length],
            outputs=output_audio
        )

    with gr.Tab("Voice Cloning"):
        with gr.Row():
            # NOTE(review): `type="file"`, `source=` and `optional=` look like
            # arguments from older Gradio releases - confirm they are accepted
            # by the Gradio version this app is deployed with.
            reference_audio = gr.Audio(
                label="🔊 Reference Audio",
                type="file",
                source="upload",
                optional=False
            )

        create_speaker_button = gr.Button("🎀 Create Speaker")

        # Holds the speaker configuration between the "create" and "generate"
        # steps; its value is fed back into generate_tts below.
        # NOTE(review): confirm gr.JSON accepts `interactive` in the deployed
        # Gradio version.
        speaker_info = gr.JSON(label="🗂️ Speaker Configuration", interactive=False)

        with gr.Row():
            generate_cloned_speech = gr.Textbox(
                label="📄 Text Input",
                placeholder="Enter the text for TTS generation with cloned voice",
                lines=3
            )

        with gr.Row():
            temperature_clone = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                step=0.01,
                label="🌑️ Temperature"
            )
            repetition_penalty_clone = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="🔁 Repetition Penalty"
            )
            max_length_clone = gr.Slider(
                minimum=256,
                maximum=4096,
                value=1024,
                step=256,
                label="📏 Max Length"
            )

        generate_cloned_button = gr.Button("🔊 Generate Cloned Speech")

        output_cloned_audio = gr.Audio(
            label="🎧 Generated Cloned Speech",
            type="filepath"  # Expecting a file path to the audio
        )

        # Define the button click event for creating a speaker
        create_speaker_button.click(
            fn=create_speaker_with_transcription,
            inputs=[reference_audio],
            outputs=speaker_info
        )

        # Define the button click event for generating speech with the cloned voice
        generate_cloned_button.click(
            fn=generate_tts,
            inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
            outputs=output_cloned_audio
        )

    gr.Markdown(
        """
        ---
        **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)

        **Credits:**
        - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
        - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
        - [faster-whisper](https://github.com/guillaumekln/faster-whisper)
        """
    )

# Launch the Gradio app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()