import os
from glob import glob
from loguru import logger
import soundfile as sf
import librosa
import gradio as gr
from huggingface_hub import hf_hub_download
import time
import torch
import yaml
from s3prl_vc.upstream.interface import get_upstream
from s3prl.nn import Featurizer
import s3prl_vc.models
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder
# ---------- Settings ----------
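# Setting CUDA_VISIBLE_DEVICES to '-1' hides all GPUs, so the models run on the CPU.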
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'
EXAMPLE_DIR = './examples'
en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))
TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
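# The four pre-defined target speakers from the Voice Conversion Challenge (VCC) 2020.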
ref_samples = {
trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
for trgspk in TRGSPKS
}
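# Reference clips for each target speaker; the UI below expects exactly two per speaker (one per preview slot).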
# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')
# ---------- Download models ----------
logger.info('============================= Download models ===========================')
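# Checkpoints, configs, and feature statistics are fetched from the Hugging Face Hub (cached locally after the first run).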
vocoder_paths = {
"ckpt": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="checkpoint-2500000steps.pkl"),
"config": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="config.yml"),
"stats": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="stats.h5")
}
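# A single neural vocoder (HiFi-GAN trained on VCTK + VCC2020) is shared by all target speakers.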
vc_model_paths = {
trgspk: {
"ckpt": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/checkpoint-10000steps.pkl"),
"config": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/config.yml"),
"stats": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/stats.h5"),
} for trgspk in TRGSPKS
}
# ---------- Model ----------
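# Each target speaker gets its own pipeline: S3R upstream -> learned featurizer -> any-to-one decoder -> vocoder.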
vc_models = {}
for trgspk in TRGSPKS:
logger.info(f'============================= Setting up model for {trgspk} =============')
checkpoint_path = vc_model_paths[trgspk]["ckpt"]
config_path = vc_model_paths[trgspk]["config"]
stats_path = vc_model_paths[trgspk]["stats"]
with open(config_path) as f:
config = yaml.load(f, Loader=yaml.Loader)
config["trg_stats"] = {
"mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
"scale": torch.from_numpy(read_hdf5(stats_path, "scale"))
.float()
.to(DEVICE),
}
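    # Target-speaker feature statistics (mean/scale), passed to both the decoder and the vocoder.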
# define upstream model
upstream_model = get_upstream(config["upstream"]).to(DEVICE)
upstream_model.eval()
upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
upstream_featurizer.load_state_dict(
torch.load(checkpoint_path, map_location="cpu")["featurizer"]
)
upstream_featurizer.eval()
# get model and load parameters
model_class = getattr(s3prl_vc.models, config["model_type"])
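    # The third constructor argument below is the frame-rate ratio between the output mels
    # (sampling_rate / hop_size) and the 16 kHz upstream features (16000 / downsample_rate).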
model = model_class(
upstream_featurizer.output_size,
config["num_mels"],
config["sampling_rate"]
/ config["hop_size"]
* upstream_featurizer.downsample_rate
/ 16000,
config["trg_stats"],
use_spemb=config.get("use_spk_emb", False),
**config["model_params"],
).to(DEVICE)
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
model = model.eval().to(DEVICE)
logger.info(f"Loaded model parameters from {checkpoint_path}.")
# load vocoder
vocoder = Vocoder(
vocoder_paths["ckpt"],
vocoder_paths["config"],
vocoder_paths["stats"],
config["trg_stats"],
DEVICE,
)
vc_models[trgspk] = {
"upstream": upstream_model,
"featurizer": upstream_featurizer,
"decoder": model,
"vocoder": vocoder
}
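# predict() loads the input at 16 kHz, runs upstream -> featurizer -> decoder -> vocoder,
# writes the result to out.wav, and returns its path.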
def predict(trgspk, wav_file):
x, fs = librosa.load(wav_file, sr=16000)
logger.info('wav file loaded')
with torch.no_grad():
start_time = time.time()
xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE)
ilens = torch.LongTensor([x.shape[0]]).to(DEVICE)
all_hs, all_hlens = vc_models[trgspk]["upstream"](xs, ilens)
logger.info('upstream done')
hs, hlens = vc_models[trgspk]["featurizer"](all_hs, all_hlens)
logger.info('featurizer done')
outs, _ = vc_models[trgspk]["decoder"](hs, hlens, spk_embs=None)
logger.info('downstream done')
out = outs[0]
y, sr = vc_models[trgspk]["vocoder"].decode(out)
logger.info('vocoder done')
        sf.write(
            "out.wav",
            y.cpu().numpy(),
            sr,  # sampling rate reported by the vocoder (24 kHz)
            "PCM_16",
        )
logger.info('write done')
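        # RTF (real-time factor) = processing time / input duration; values above 1 mean slower than real time.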
logger.info('RTF={}'.format(
(time.time() - start_time) / (len(x) / 16000)
))
return "out.wav"
with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo:
gr.Markdown(
"""
# S3PRL-VC: Any-to-one voice conversion demo on VCC2020
### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper(JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc)
**S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). The term **any-to-one** means that the system can convert from any unseen source speaker to a pre-defined target speaker seen during training.
In this demo, you can upload (or record) your voice, and the model will convert it to one of four pre-defined speakers. These speakers come from the **Voice Conversion Challenge (VCC) 2020**. You can listen to the reference samples to get a sense of what each speaker sounds like.
The **real-time factor (RTF)** of the system is around **1.5~2.5**, i.e. if you record a 5-second clip, it will take about 5 * (1.5~2.5) = 7.5~12.5 seconds to generate the output.
"""
)
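    # Two-column layout: source audio and target-speaker selection on the left, converted speech on the right.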
with gr.Row():
with gr.Column():
gr.Markdown("## Upload a .wav file here!")
input_wav = gr.Audio(label="Source speech", source='upload', type='filepath')
gr.Markdown("## Select a target speaker!")
trgspk = gr.Radio(label="Target speaker", choices=["TEF1", "TEF2", "TEM1", "TEM2"])
gr.Markdown("### Here is what the target speaker sounds like!")
ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath")
ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath")
            trgspk.change(
                lambda trgspk: ref_samples[trgspk],
                inputs=trgspk,
                outputs=[ref_sample_wav1, ref_sample_wav2],
            )
convert_btn = gr.Button(value="Convert!")
gr.Markdown("### You can use these examples if using a microphone is too troublesome!")
            gr.Markdown("I recorded the samples with my MacBook Pro, so there might be some background noise.")
gr.Examples(
examples=en_examples,
inputs=input_wav,
label="English examples"
)
gr.Examples(
examples=jp_examples,
inputs=input_wav,
label="Japanese examples"
)
gr.Examples(
examples=zh_examples,
inputs=input_wav,
label="Mandarin examples"
)
with gr.Column():
gr.Markdown("## Listen to the converted speech here!")
output_wav = gr.Audio(type="filepath", label="Converted speech")
convert_btn.click(predict, [trgspk, input_wav], output_wav)
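# Launch the demo when run as a script; the request queue keeps long-running conversions from timing out.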
if __name__ == '__main__':
try:
demo.launch(debug=True,
enable_queue=True,
)
except KeyboardInterrupt as e:
print(e)
finally:
demo.close()