Spaces:
Running
Running
File size: 8,976 Bytes
e44094d 4e4528b 153c25e a2dc963 e44094d 4e4528b e44094d 4e4528b 6ca77a8 a2dc963 153c25e 4e4528b 153c25e 4e4528b 153c25e 4e4528b 153c25e 4e4528b 153c25e 4e4528b 153c25e 4e4528b a2dc963 4e4528b 6ca77a8 a2dc963 153c25e a2dc963 153c25e a2dc963 153c25e a2dc963 153c25e a2dc963 153c25e a2dc963 153c25e 4e4528b 6ca77a8 4e4528b 153c25e 4e4528b a2dc963 4e4528b 6ca77a8 153c25e a2dc963 153c25e a2dc963 153c25e a2dc963 153c25e 4e4528b 153c25e a2dc963 153c25e 4e4528b 153c25e 4e4528b 153c25e 4e4528b a2dc963 4e4528b e44094d 4e4528b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
import gradio as gr
from outetts.v0_1.interface import InterfaceHF
import logging
import os
import tempfile
# Import faster-whisper for transcription
from faster_whisper import WhisperModel
# Configure the root logger so INFO-level (and higher) records are displayed in the terminal
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, shared by the functions below
logger = logging.getLogger(__name__)
# Initialize the OuteTTS interface with the Hugging Face model.
# This runs at import time; if the model cannot be loaded the error is logged
# and re-raised, so the app does not start without a TTS backend.
try:
    logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'")
    interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    logger.info("Model loaded successfully.")
except Exception as e:
    logger.error("Failed to load model: %s", e)
    # Bare `raise` re-raises the active exception with its original traceback.
    raise
# Initialize the faster-whisper model used to transcribe reference audio.
# The "tiny" model is loaded on CPU with int8 compute; a load failure is
# logged and re-raised so the app does not start without transcription.
try:
    logger.info("Initializing faster-whisper model for transcription.")
    whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
    logger.info("faster-whisper model loaded successfully.")
except Exception as e:
    logger.error("Failed to load faster-whisper model: %s", e)
    # Bare `raise` re-raises the active exception with its original traceback.
    raise
def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
    """
    Generates speech from the input text using the OuteTTS model.

    Parameters:
        text (str): The input text for TTS.
        temperature (float): Sampling temperature.
        repetition_penalty (float): Repetition penalty.
        max_length (int): Maximum length of the generated audio tokens.
        speaker (dict | None): Speaker configuration for voice cloning, or
            None to generate without a cloned voice.

    Returns:
        str | None: Path to the generated WAV file, or None when the text is
        empty or generation fails (the error is logged, not raised).
    """
    logger.info("Received TTS generation request.")
    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}")

    # Nothing to synthesize: skip the model call entirely.
    if not text or not text.strip():
        logger.error("Error during TTS generation: input text is empty.")
        return None

    try:
        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
        output = interface.generate(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=max_length,  # Pass the parameter with typo
            speaker=speaker,
        )
        logger.info("TTS generation complete.")

        # Save to a unique temporary WAV file per request. A single shared
        # "output.wav" would be overwritten when two requests overlap, so one
        # user could receive another user's audio.
        fd, output_path = tempfile.mkstemp(prefix="outetts_", suffix=".wav")
        os.close(fd)  # only the path is needed; output.save() reopens it
        output.save(output_path)
        logger.info(f"Audio saved to {output_path}")
        return output_path  # Gradio will handle the audio playback
    except Exception as e:
        logger.error(f"Error during TTS generation: {e}")
        return None
def transcribe_audio(audio_path):
    """
    Run the module-level faster-whisper model on an audio file.

    Parameters:
        audio_path (str): Path to the audio file.

    Returns:
        str: The text of every returned segment, joined with single spaces.
    """
    logger.info(f"Transcribing audio file: {audio_path}")
    # transcribe() returns (segments, info); only the segments are used here.
    segments, _info = whisper_model.transcribe(audio_path)
    transcript = " ".join(piece.text for piece in segments)
    logger.info(f"Transcription complete: {transcript}")
    return transcript
def create_speaker_with_transcription(audio_file):
    """
    Creates a custom speaker from a reference audio file by automatically transcribing it.

    The uploaded audio is copied to a temporary WAV file, transcribed with
    `transcribe_audio`, and passed together with the transcript to
    `interface.create_speaker`. The temporary file is always removed,
    whether the request succeeds, yields an empty transcript, or fails.

    Parameters:
        audio_file (file): Uploaded reference audio; must provide a `read()`
            method returning the audio bytes.

    Returns:
        dict | None: Speaker configuration, or None when no audio was
        provided, the transcript is empty, or an error occurs (errors are
        logged, not raised).
    """
    logger.info("Received Voice Cloning request with audio file.")

    if audio_file is None:
        logger.error("Error during speaker creation: no reference audio provided.")
        return None

    temp_audio_path = None
    try:
        # Write through the handle returned by NamedTemporaryFile instead of
        # reopening the path; delete=False keeps the file after the handle
        # closes so it can be read back by path below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name
            temp_audio.write(audio_file.read())
        logger.info(f"Reference audio saved to {temp_audio_path}")

        # Transcribe the audio file
        transcript = transcribe_audio(temp_audio_path)
        if not transcript.strip():
            logger.error("Transcription resulted in empty text.")
            return None

        # Create speaker using the transcribed text
        speaker = interface.create_speaker(temp_audio_path, transcript)
        logger.info("Speaker created successfully.")
        return speaker
    except Exception as e:
        logger.error(f"Error during speaker creation: {e}")
        return None
    finally:
        # Clean up the temporary audio file on every exit path.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
                logger.info(f"Temporary audio file {temp_audio_path} removed.")
            except OSError as cleanup_error:
                # A failed cleanup must not discard an otherwise valid result.
                logger.warning(
                    "Could not remove temporary audio file %s: %s",
                    temp_audio_path,
                    cleanup_error,
                )
# Define the Gradio Blocks interface: one tab for plain TTS and one tab for
# voice cloning (create a speaker from reference audio, then generate with it).
# NOTE: the multi-line Markdown strings are kept flush-left so the text passed
# to Gradio does not pick up the code's indentation.
with gr.Blocks() as demo:
    gr.Markdown("# π€ OuteTTS - Text to Speech Interface")
    gr.Markdown(
        """
Generate speech from text using the **OuteTTS-0.1-350M** model.
**Key Features:**
- Pure language modeling approach to TTS
- Voice cloning capabilities with automatic transcription
- Compatible with LLaMa architecture
"""
    )

    with gr.Tab("Basic TTS"):
        with gr.Row():
            text_input = gr.Textbox(
                label="π Text Input",
                placeholder="Enter the text for TTS generation",
                lines=3
            )
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                step=0.01,
                label="π‘οΈ Temperature"
            )
            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="π Repetition Penalty"
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=4096,
                value=1024,
                step=256,
                label="π Max Length"
            )
        generate_button = gr.Button("π Generate Speech")
        output_audio = gr.Audio(
            label="π§ Generated Speech",
            type="filepath"  # Expecting a file path to the audio
        )
        # Define the button click event for Basic TTS.
        # Every entry of `inputs` must be a Gradio component, so a literal
        # None cannot be listed there; the wrapper supplies speaker=None
        # (no cloned voice) itself.
        generate_button.click(
            fn=lambda text, temp, rep_penalty, max_len: generate_tts(
                text, temp, rep_penalty, max_len, None
            ),
            inputs=[text_input, temperature, repetition_penalty, max_length],
            outputs=output_audio
        )

    with gr.Tab("Voice Cloning"):
        with gr.Row():
            # NOTE(review): `type="file"`, `source` and `optional` are only
            # accepted by some Gradio releases - confirm against the pinned
            # Gradio version before upgrading. create_speaker_with_transcription
            # calls .read() on the value this component delivers.
            reference_audio = gr.Audio(
                label="π Reference Audio",
                type="file",
                source="upload",
                optional=False
            )
            create_speaker_button = gr.Button("π€ Create Speaker")
        speaker_info = gr.JSON(label="ποΈ Speaker Configuration", interactive=False)
        with gr.Row():
            generate_cloned_speech = gr.Textbox(
                label="π Text Input",
                placeholder="Enter the text for TTS generation with cloned voice",
                lines=3
            )
        with gr.Row():
            temperature_clone = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                step=0.01,
                label="π‘οΈ Temperature"
            )
            repetition_penalty_clone = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="π Repetition Penalty"
            )
            max_length_clone = gr.Slider(
                minimum=256,
                maximum=4096,
                value=1024,
                step=256,
                label="π Max Length"
            )
        generate_cloned_button = gr.Button("π Generate Cloned Speech")
        output_cloned_audio = gr.Audio(
            label="π§ Generated Cloned Speech",
            type="filepath"  # Expecting a file path to the audio
        )
        # Define the button click event for creating a speaker; the result is
        # shown in (and later read back from) the speaker_info JSON component.
        create_speaker_button.click(
            fn=create_speaker_with_transcription,
            inputs=[reference_audio],
            outputs=speaker_info
        )
        # Define the button click event for generating speech with the cloned voice
        generate_cloned_button.click(
            fn=generate_tts,
            inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
            outputs=output_cloned_audio
        )

    gr.Markdown(
        """
---
**Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
**Credits:**
- [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
- [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
- [faster-whisper](https://github.com/guillaumekln/faster-whisper)
"""
    )

# Launch the Gradio app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()
|