File size: 7,516 Bytes
5a39a85 7c62735 5a39a85 5dbc09c 30aecac 7c62735 aac3370 7c62735 5a39a85 30aecac 5dbc09c cc2340f 7c62735 cc2340f 7c62735 30aecac 7c62735 30aecac 5dbc09c cc2340f 30aecac 5a39a85 30aecac 2d29569 5dbc09c 30aecac 5dbc09c 2d29569 cc2340f 2d29569 5a39a85 5dbc09c cc2340f 5dbc09c 5a39a85 5dbc09c cc2340f 5dbc09c cc2340f 5dbc09c 30aecac 5dbc09c 5a39a85 2d29569 cc2340f 5a39a85 7c62735 7ce428c cc2340f 7c62735 7ce428c 2d29569 cc2340f 7ce428c 5dbc09c cc2340f 2d29569 30aecac 5dbc09c 2d29569 cc2340f 30aecac cc2340f 2d29569 30aecac 5a39a85 7ce428c cc2340f 7ce428c 5dbc09c 2d29569 5a39a85 7ce428c 5dbc09c 7ce428c 2d29569 cc2340f 2d29569 cc2340f 2d29569 cc2340f 2d29569 7ce428c 2d29569 7ce428c 5a39a85 7ce428c cc2340f 7ce428c 5a39a85 7ce428c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceGGUF
import soundfile as sf
import tempfile
import os
from faster_whisper import WhisperModel
import huggingface_hub
def download_model():
    """Fetch the quantized OuteTTS GGUF weights from the HuggingFace Hub.

    Returns:
        Whatever ``huggingface_hub.hf_hub_download`` returns for the requested
        file (used by the caller as the model path).
    """
    repo = "OuteAI/OuteTTS-0.1-350M-GGUF"
    weights_file = "OuteTTS-0.1-350M-Q6_K.gguf"
    return huggingface_hub.hf_hub_download(repo_id=repo, filename=weights_file)
def initialize_models():
    """Build the OuteTTS GGUF interface and the Faster-Whisper transcriber.

    Returns:
        A ``(tts_interface, asr_model)`` tuple: the ``InterfaceGGUF`` built
        from the downloaded GGUF file and the ``WhisperModel`` instance.
    """
    # Runtime settings for the GGUF model, kept deliberately small.
    gguf_settings = {
        "n_ctx": 2048,  # reduced context size
        "n_batch": 512,  # reduced batch size
        "n_threads": 4,  # adjust based on CPU
        "verbose": False,  # reduce logging
    }
    tts = InterfaceGGUF(download_model(), **gguf_settings)

    # "tiny" Whisper checkpoint on CPU with int8 compute, one worker/thread.
    whisper_settings = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }
    asr = WhisperModel("tiny", **whisper_settings)

    return tts, asr
# Load both models a single time at import so requests never reload them.
try:
    TTS_INTERFACE, ASR_MODEL = initialize_models()
except Exception as init_error:
    # Report the failure on stdout, then let the original exception propagate.
    print("Error initializing models: " + str(init_error))
    raise
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Clone the voice from a reference recording and speak new text with it.

    Args:
        audio_path: Path to the reference audio file; falsy when nothing was
            uploaded.
        reference_text: Transcript of the reference audio. When blank (or
            ``None``) the audio is transcribed with ``transcribe_audio``.
        text_to_speak: Text to synthesize; truncated to 300 characters.
        temperature: Forwarded to ``TTS_INTERFACE.generate``.
        repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, error_message)``
        when input is missing or any step fails. Exceptions are converted into
        the error message rather than propagated.
    """
    try:
        # Reject missing input up front instead of failing deep inside the model.
        if not audio_path:
            return None, "Error: Please upload a reference audio file first."
        if not (text_to_speak or "").strip():
            return None, "Error: Please enter the text to speak."

        reference_text = reference_text or ""

        # If no reference text provided, transcribe the audio
        if not reference_text.strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            # transcribe_audio reports failures in-band as an "Error..." string.
            if reference_text.startswith("Error"):
                return None, reference_text
            if not reference_text.strip():
                return None, "Error: No speech could be transcribed from the reference audio."
            gr.Info(f"Using reference text: {reference_text}")

        # Limit text lengths to prevent context overflow
        reference_text = reference_text[:2000]
        text_to_speak = text_to_speak[:300]

        # Create speaker from reference audio
        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)

        # Generate speech with cloned voice
        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): keyword spelling kept exactly as in the original
            # call; presumably it matches the outetts v0.1 API - confirm
            # before renaming.
            max_lenght=1024,
        )

        # Reserve a .wav path and close the handle immediately so it is not
        # leaked and the file can be reopened for writing on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        status = (
            "Processing complete!\n"
            f"Reference text: {reference_text[:300]}...\n"
            "(Showing first 300 characters of reference text)"
        )
        return output_path, status
    except Exception as e:
        # UI boundary: surface any failure in the status box instead of crashing.
        return None, f"Error: {str(e)}"
# Create Gradio interface
# NOTE(review): the nesting of rows/columns below was reconstructed from the
# statement order of a flattened source - confirm the intended layout.
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown(
        "\n"
        "This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,\n"
        "provide the text being spoken in that audio (or leave blank for automatic transcription),\n"
        "and enter the new text you want to be spoken in the cloned voice.\n"
        "Note:\n"
        "- For best results, use clear audio with minimal background noise\n"
        "- Reference text is limited to 2000 characters\n"
        "- Output text is limited to 300 characters\n"
        "- Short inputs work best for quality results\n"
    )

    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(
                label="Upload Reference Audio",
                type="filepath",
                max_length=30,  # Limit audio length to 30 seconds
            )
            with gr.Row():
                # NOTE(review): the original icon could not be recovered from
                # the garbled label; the memo emoji is a stand-in - confirm.
                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
                max_lines=5,
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
                lines=3,
                max_lines=5,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=0.5,  # Reduced maximum temperature
                    value=0.1,
                    step=0.05,
                    label="Temperature (keep low for stability)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=1.3,  # Reduced maximum
                    value=1.1,
                    step=0.05,
                    label="Repetition Penalty",
                )
            # Submit button
            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")

        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)
            # Add warning about processing time
            gr.Markdown(
                "\n"
                "⚠️ Note: Initial processing may take a few moments. Please be patient.\n"
            )

    # Messages transcribe_audio returns in place of a transcript on failure.
    _TRANSCRIBE_FAILURE_PREFIXES = (
        "Please upload audio first.",
        "Error transcribing audio:",
    )

    def transcribe_audio(audio_path):
        """Transcribe audio using Faster-Whisper tiny.

        Args:
            audio_path: Path to the audio file; falsy when nothing was uploaded.

        Returns:
            The transcript (at most 2000 characters), or a message string:
            "Please upload audio first." when no path is given, or
            "Error transcribing audio: ..." when transcription raises.
        """
        try:
            if not audio_path:
                return "Please upload audio first."
            segments, _ = ASR_MODEL.transcribe(
                audio_path,
                beam_size=1,
                best_of=1,
                # NOTE(review): a single temperature of 1.0 is kept from the
                # original; confirm this is intended rather than 0.0.
                temperature=1.0,
                condition_on_previous_text=False,
                compression_ratio_threshold=2.4,
                log_prob_threshold=-1.0,
                no_speech_threshold=0.6,
            )
            text = " ".join(segment.text for segment in segments).strip()
            return text[:2000]  # Limit transcription length
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def _on_transcribe_click(audio_path):
        """Button handler: fill the reference box only with a real transcript.

        Failure messages are shown as a warning toast and the text box is left
        unchanged, so they can never be picked up later as reference text.
        """
        result = transcribe_audio(audio_path)
        if result.startswith(_TRANSCRIBE_FAILURE_PREFIXES):
            gr.Warning(result)
            return gr.update()
        return result

    # Handle transcription button
    transcribe_btn.click(
        fn=_on_transcribe_click,
        inputs=[audio_input],
        outputs=[reference_text],
    )

    # Handle main generation
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown(
        "\n"
        "### Tips for best results:\n"
        "1. Use clear, short audio samples (5-15 seconds is ideal)\n"
        "2. Keep both reference and output text concise\n"
        "3. Use lower temperature (0.1-0.2) for more stable output\n"
        "4. Start with short phrases to test the voice\n"
        "5. If generation fails, try:\n"
        "- Using shorter text\n"
        "- Reducing temperature\n"
        "- Using clearer audio\n"
        "- Simplifying the text\n"
    )
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    demo.launch()