File size: 3,481 Bytes
5a39a85 7ce428c 5a39a85 7ce428c 5a39a85 7ce428c 5a39a85 7ce428c 5a39a85 7ce428c 5a39a85 7ce428c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceHF
import soundfile as sf
import tempfile
import os
# Lazily-created interface shared by every call to initialize_model().
_INTERFACE = None


def initialize_model():
    """Return the OuteTTS interface, creating it on first use.

    The interface is built once from the ``OuteAI/OuteTTS-0.1-350M``
    checkpoint and then reused, so repeated calls (one per generation
    request) do not construct a new ``InterfaceHF`` each time.
    Constructing it is assumed to be expensive (it presumably loads the
    model weights) -- confirm against the outetts documentation.

    Returns:
        The shared ``InterfaceHF`` instance.
    """
    global _INTERFACE
    if _INTERFACE is None:
        _INTERFACE = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    return _INTERFACE
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Generate speech for ``text_to_speak`` using a voice cloned from a reference clip.

    Args:
        audio_path: Path of the reference recording.
        reference_text: Transcript of what is said in the reference recording.
        text_to_speak: Text to synthesize with the cloned voice.
        temperature: Forwarded unchanged to ``interface.generate``.
        repetition_penalty: Forwarded unchanged to ``interface.generate``.

    Returns:
        A ``(wav_path, status_message)`` tuple. On success ``wav_path`` is the
        path of a newly created ``.wav`` file. On any failure ``wav_path`` is
        ``None`` and the message starts with ``"Error: "``; this function does
        not propagate exceptions to its caller.
    """
    try:
        # Reject missing inputs before touching the model so the user gets a
        # clear message instead of an opaque library error. The checks live
        # inside the try so that even malformed arguments yield an error tuple.
        if not audio_path:
            return None, "Error: Please upload a reference audio file."
        if not reference_text or not reference_text.strip():
            return None, "Error: Please provide the reference text."
        if not text_to_speak or not text_to_speak.strip():
            return None, "Error: Please provide the text to speak."

        interface = initialize_model()

        # Create speaker from reference audio
        speaker = interface.create_speaker(audio_path, reference_text)

        # Generate speech with cloned voice
        output = interface.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): the misspelled keyword "max_lenght" is kept on
            # purpose; it appears to be the name the outetts v0.1 generate()
            # API expects -- confirm before renaming.
            max_lenght=4096,
        )

        # Reserve a unique .wav path, then close our handle before the library
        # writes to it: an open handle would leak a descriptor and can block
        # a second writer on some platforms. delete=False keeps the file on
        # disk so the caller can read it after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)
        return output_path, "Voice cloning successful!"
    except Exception as e:
        # UI boundary: report the failure in the status message rather than
        # raising.
        return None, f"Error: {str(e)}"
# Markdown copy for the page. Kept as flush-left module constants so that the
# indentation of the layout code below never leaks into the rendered text.
_INTRO_MARKDOWN = """
This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio,
and enter the new text you want to be spoken in the cloned voice.
Note: For best results, use clear audio with minimal background noise.
"""

_TIPS_MARKDOWN = """
### Tips for best results:
1. Use high-quality reference audio (clear speech, minimal background noise)
2. Ensure reference text matches the audio exactly
3. Keep generated text relatively short for better quality
4. Adjust temperature and repetition penalty if needed:
- Lower temperature (0.1-0.3) for more consistent output
- Higher repetition penalty (1.1-1.3) to avoid repetition
"""

# Create Gradio interface
with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    # The heading emoji is spelled with escapes because the previous literal
    # was mis-encoded (mojibake). NOTE(review): the intended glyph is presumed
    # to be the studio-microphone emoji (U+1F399 + U+FE0F) -- confirm.
    gr.Markdown("# \U0001F399\uFE0F Voice Cloning with OuteTTS")
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            reference_text = gr.Textbox(label="Reference Text (what is being said in the audio)")
            text_to_speak = gr.Textbox(label="Text to Speak (what you want the cloned voice to say)")

            with gr.Row():
                # Slider defaults mirror the defaults of process_audio_file.
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature (higher = more variation)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                )

            # Submit button
            submit_btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status")

    # Handle submission: the inputs are passed positionally to
    # process_audio_file, and its (path, message) result fills the two outputs.
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown(_TIPS_MARKDOWN)
if __name__ == "__main__":
    # Launch the app only when this file is executed as a script, so that
    # importing the module does not start it.
    demo.launch()