import os

import gradio as gr
import numpy as np
import phonemizer
import torch

from styletts2importable import compute_style, device, inference
from txtsplit import txtsplit

theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont("Libre Franklin"),
        gr.themes.GoogleFont("Public Sans"),
        "system-ui",
        "sans-serif",
    ],
)

voicelist = [
    "f-us-1",
    "f-us-2",
    "f-us-3",
    "f-us-4",
    "m-us-1",
    "m-us-2",
    "m-us-3",
    "m-us-4",
]
voices = {}

global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)

# Load each reference voice's style embedding, computing and caching it on first use.
for v in voicelist:
    cache_path = f"voices/{v}.wav.npy"
    if os.path.exists(cache_path):
        voices[v] = torch.from_numpy(np.load(cache_path)).to(device)
    else:
        style = compute_style(f"voices/{v}.wav")
        voices[v] = style
        np.save(cache_path, style.cpu().numpy())


def synthesize(text, voice, lngsteps):
    """Split the text into chunks, synthesize each with the selected voice,
    and return (sample_rate, waveform) for a gr.Audio output."""
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    texts = txtsplit(text)
    v = voice.lower()
    audios = []
    for t in texts:
        audios.append(
            inference(
                t,
                voices[v],
                alpha=0.3,
                beta=0.7,
                diffusion_steps=lngsteps,
                embedding_scale=1,
            )
        )
    # StyleTTS 2 generates audio at 24 kHz.
    return (24000, np.concatenate(audios))


with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            multispeakersteps = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher values give better quality but run slower; in practice the difference is hard to notice. Try lower steps first - they are faster.",
                interactive=True,
            )
            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
            btn.click(
                synthesize,
                inputs=[inp, voice, multispeakersteps],
                outputs=[audio],
                concurrency_limit=4,
            )

with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    # Only the multi-voice tab is wired up here; the other tabs
    # ("Voice Cloning", "LJSpeech", "Long Text [Beta]") are not included,
    # so the tab-name list must match the single interface passed in.
    gr.TabbedInterface([vctk], ["Multi-Voice"])

if __name__ == "__main__":
    # demo.queue(api_open=False, max_size=15).launch(show_api=False)
    print("Launching")
    # Local latency check (kept for reference; requires `import time`):
    # start_time = time.time()
    # synthesize(
    #     "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
    #     "m-us-2",
    #     3,
    # )
    # print(f"Synthesized first run in {time.time() - start_time} seconds")
    # second_start_time = time.time()
    # synthesize(
    #     "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
    #     "m-us-2",
    #     3,
    # )
    # print(f"Synthesized second run in {time.time() - second_start_time} seconds")
    demo.queue(api_open=True, max_size=None).launch(show_api=False)
    print("Launched")
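# A minimal offline usage sketch (not part of the app): `synthesize` can be called
# directly and its (sample_rate, waveform) result written to disk. This assumes the
# `soundfile` package is installed; it is not a dependency of this script.
#
#   import soundfile as sf
#   sr, wav = synthesize("Hello from StyleTTS 2.", "m-us-2", 5)
#   sf.write("sample.wav", wav, sr)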