|
import os |
|
|
|
import gradio as gr |
|
import torch |
|
from styletts2importable import compute_style, device, inference |
|
from txtsplit import txtsplit |
|
import numpy as np |
|
import phonemizer |
|
|
|
|
|
# Gradio theme for the whole app: Libre Franklin as the primary face,
# with Public Sans and generic system fonts as fallbacks.
_font_stack = [
    gr.themes.GoogleFont("Libre Franklin"),
    gr.themes.GoogleFont("Public Sans"),
    "system-ui",
    "sans-serif",
]
theme = gr.themes.Base(font=_font_stack)
|
# Preset reference voices shipped with the demo: four female ("f") and four
# male ("m") US-English speakers, each backed by a voices/<name>.wav file.
voicelist = [f"{sex}-us-{idx}" for sex in ("f", "m") for idx in range(1, 5)]

# Mapping of voice name -> precomputed style embedding, populated at startup.
voices = {}
|
|
|
# Shared eSpeak grapheme-to-phoneme backend for US English, keeping
# punctuation and stress markers in the phonemized output.
_espeak_options = {
    "language": "en-us",
    "preserve_punctuation": True,
    "with_stress": True,
}
global_phonemizer = phonemizer.backend.EspeakBackend(**_espeak_options)
|
|
|
# Load (or lazily compute and cache) the style embedding for every preset
# voice. Embeddings are persisted next to the source wav as <name>.wav.npy
# so later startups skip the expensive compute_style call.
for name in voicelist:
    wav_path = f"voices/{name}.wav"
    npy_path = wav_path + ".npy"
    if os.path.exists(npy_path):
        # Cache hit: restore the saved embedding onto the inference device.
        voices[name] = torch.from_numpy(np.load(npy_path)).to(device)
        continue
    # Cache miss: compute the embedding once and persist it for next time.
    embedding = compute_style(wav_path)
    voices[name] = embedding
    np.save(npy_path, embedding.cpu().numpy())
|
|
|
|
|
def synthesize(text, voice, lngsteps):
    """Synthesize speech for `text` using a preset voice.

    Parameters
    ----------
    text : str
        Text to read aloud; must be non-empty and under 50k characters.
    voice : str
        Name of a preset voice from `voicelist` (matched case-insensitively).
    lngsteps : int
        Number of diffusion steps forwarded to `inference`.

    Returns
    -------
    tuple[int, np.ndarray]
        Sample rate (24000 Hz) and the concatenated waveform.

    Raises
    ------
    gr.Error
        If the text is empty or exceeds the length limit.
    """
    if not text.strip():
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    # Hoist the style-embedding lookup out of the per-segment loop.
    style = voices[voice.lower()]
    # Split long input into chunks the model can handle, render each, then
    # stitch the waveforms back together.
    rendered = [
        inference(
            segment,
            style,
            alpha=0.3,
            beta=0.7,
            diffusion_steps=lngsteps,
            embedding_scale=1,
        )
        for segment in txtsplit(text)
    ]
    return (24000, np.concatenate(rendered))
|
|
|
|
|
# "Multi-Voice" tab layout: text box, preset-voice dropdown and
# diffusion-steps slider on the left; synthesize button and audio player
# on the right. The button wires the inputs into synthesize().
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice_choice = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            steps_slider = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )

        with gr.Column(scale=1):
            synth_button = gr.Button("Synthesize", variant="primary")
            audio_player = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
            synth_button.click(
                synthesize,
                inputs=[text_input, voice_choice, steps_slider],
                outputs=[audio_player],
                concurrency_limit=4,
            )
|
|
|
# Top-level app shell wrapping the multi-voice demo in a tabbed interface.
# NOTE: the original passed four tab titles ("Multi-Voice", "Voice Cloning",
# "LJSpeech", "Long Text [Beta]") for a single interface; TabbedInterface
# pairs interfaces and titles positionally, so the extra three were dead
# config. Only the title for the one real tab is kept.
with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    gr.TabbedInterface([vctk], ["Multi-Voice"])
|
if __name__ == "__main__":
    print("Launching")
    # Enable the request queue (unbounded, API open) and start the server;
    # the Gradio API docs page itself is hidden from the UI.
    app = demo.queue(api_open=True, max_size=None)
    app.launch(show_api=False)
    print("Launched")
|
|