import spaces
import gradio as gr
import torch
from transformers import VitsModel, VitsTokenizer, set_seed
# Static Markdown snippets shown on the demo page.
# They are spelled out line by line so every newline is explicit.

# The heading is intentionally just a single newline (no visible title).
title = "\n"

# Introductory blurb: one leading newline, then two Markdown lines.
description = (
    "\n"
    "VITS is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. It is a conditional variational autoencoder (VAE) comprised of a posterior encoder, decoder, and conditional prior.\n"
    "This demo showcases the official VITS checkpoints, trained on the [LJSpeech](https://huggingface.co/kakao-enterprise/vits-ljs) and [VCTK](https://huggingface.co/kakao-enterprise/vits-vctk) datasets.\n"
)

# Credits line.
article = "Model by Jaehyeon Kim et al. from Kakao Enterprise. Code and demo by 🤗 Hugging Face."
# Choose the execution device once; both checkpoints are placed on it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Single-speaker checkpoint (LJ Speech) and its tokenizer.
ljs_tokenizer = VitsTokenizer.from_pretrained("kakao-enterprise/vits-ljs")
ljs_model = VitsModel.from_pretrained("kakao-enterprise/vits-ljs").to(device)

# Multi-speaker checkpoint (VCTK) and its tokenizer.
vctk_tokenizer = VitsTokenizer.from_pretrained("kakao-enterprise/vits-vctk")
vctk_model = VitsModel.from_pretrained("kakao-enterprise/vits-vctk").to(device)
@spaces.GPU
def ljs_forward(text, speaking_rate=1.0):
    """Synthesise speech for ``text`` with the LJ Speech checkpoint.

    Args:
        text: Sentence to read aloud.
        speaking_rate: Value assigned to ``ljs_model.speaking_rate`` before
            generation (the UI offers 0.5-1.5).

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is a float
        NumPy array taken from the first item of the model output. This is
        the raw-audio form that a ``gr.Audio`` output component accepts.
    """
    # Tokenise and move the encoded batch to wherever the model lives.
    inputs = ljs_tokenizer(text, return_tensors="pt").to(ljs_model.device)

    # The rate is a model attribute, not a forward() argument.
    ljs_model.speaking_rate = speaking_rate
    # Fixed seed so the same text yields the same audio on every request.
    set_seed(555)
    with torch.no_grad():
        outputs = ljs_model(**inputs)[0]

    waveform = outputs[0].cpu().float().numpy()
    # Hand the raw audio to gr.Audio rather than rendering a waveform video
    # with gr.make_waveform (meant for gr.Video, deprecated and later removed
    # from Gradio). The sample rate comes from the checkpoint config instead
    # of a hard-coded constant.
    return ljs_model.config.sampling_rate, waveform
@spaces.GPU
def vctk_forward(text, speaking_rate=1.0, speaker_id=1):
    """Synthesise speech for ``text`` with the multi-speaker VCTK checkpoint.

    Args:
        text: Sentence to read aloud.
        speaking_rate: Value assigned to ``vctk_model.speaking_rate`` before
            generation (the UI offers 0.5-1.5).
        speaker_id: 1-based speaker number as shown in the UI; it is shifted
            to the 0-based id passed to the model.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is a float
        NumPy array taken from the first item of the model output. This is
        the raw-audio form that a ``gr.Audio`` output component accepts.
    """
    # Tokenise and move the encoded batch to wherever the model lives.
    inputs = vctk_tokenizer(text, return_tensors="pt").to(vctk_model.device)

    # UI ids start at 1, model ids at 0. Casting to a built-in int ensures a
    # float delivered by the slider (e.g. 3.0) is not handed to the model
    # as-is.
    model_speaker_id = int(speaker_id) - 1

    # The rate is a model attribute, not a forward() argument.
    vctk_model.speaking_rate = speaking_rate
    # Fixed seed so the same request yields the same audio every time.
    set_seed(555)
    with torch.no_grad():
        outputs = vctk_model(**inputs, speaker_id=model_speaker_id)[0]

    waveform = outputs[0].cpu().float().numpy()
    # Hand the raw audio to gr.Audio rather than rendering a waveform video
    # with gr.make_waveform (meant for gr.Video, deprecated and later removed
    # from Gradio). The sample rate comes from the checkpoint config instead
    # of a hard-coded constant.
    return vctk_model.config.sampling_rate, waveform
# Tab for the single-speaker model: a text box and a speaking-rate slider
# feed ljs_forward, and the result goes to an audio player.
ljs_inference = gr.Interface(
    fn=ljs_forward,
    inputs=[
        gr.Textbox(label="Input text", value="Hey, it's Hugging Face on the phone", max_lines=1),
        gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speaking rate"),
    ],
    outputs=gr.Audio(),
)
# Tab for the multi-speaker model: text, speaking rate and a 1-based speaker
# selector feed vctk_forward, and the result goes to an audio player.
vctk_inference = gr.Interface(
    fn=vctk_forward,
    inputs=[
        gr.Textbox(label="Input text", value="Hey, it's Hugging Face on the phone", max_lines=1),
        gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speaking rate"),
        # The upper bound follows the number of speakers in the checkpoint config.
        gr.Slider(
            minimum=1,
            maximum=vctk_model.config.num_speakers,
            value=1,
            step=1,
            label="Speaker id",
            info=f"The VCTK model is trained on {vctk_model.config.num_speakers} speakers. You can prompt the model using one of these speaker ids.",
        ),
    ],
    outputs=gr.Audio(),
)
# Page layout: heading, description, the two model tabs, then the credits.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface(
        [ljs_inference, vctk_inference],
        ["LJ Speech", "VCTK"],
    )
    gr.Markdown(article)

# Cap the request queue at 10 entries, then start serving.
demo.queue(max_size=10).launch()