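"""Gradio demo for S3PRL-VC: any-to-one voice conversion on the VCC2020 target speakers."""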
import os
import time
from glob import glob

import gradio as gr
import librosa
import soundfile as sf
import torch
import yaml
from huggingface_hub import hf_hub_download
from loguru import logger
from s3prl.nn import Featurizer

import s3prl_vc.models
from s3prl_vc.upstream.interface import get_upstream
from s3prl_vc.utils import read_hdf5
from s3prl_vc.vocoder import Vocoder

# ---------- Settings ----------
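# Setting CUDA_VISIBLE_DEVICES to '-1' hides all GPUs, so inference runs on CPU.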
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

SERVER_PORT = 42208
SERVER_NAME = "0.0.0.0"
SSL_DIR = './keyble_ssl'
EXAMPLE_DIR = './examples'
en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))
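
# TEF*/TEM* are the VCC2020 English target speakers (F = female, M = male).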
TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
ref_samples = {
    trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
    for trgspk in TRGSPKS
}

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Download models ----------
logger.info('============================= Download models ===========================')
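# HiFi-GAN vocoder checkpoint; judging by the repo name it was trained on VCTK
# plus the VCC2020 corpus, and it decodes mel spectrograms to 24 kHz waveforms.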
vocoder_paths = {
    "ckpt": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="checkpoint-2500000steps.pkl"),
    "config": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="config.yml"),
    "stats": hf_hub_download(repo_id="unilight/hifigan_vctk_plus_vcc2020", filename="stats.h5"),
}
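
# One any-to-one model (checkpoint / config / feature stats) per target speaker.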
vc_model_paths = {
    trgspk: {
        "ckpt": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/checkpoint-10000steps.pkl"),
        "config": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/config.yml"),
        "stats": hf_hub_download(repo_id="unilight/s3prl-vc-vcc2020", filename=f"{trgspk}/stats.h5"),
    }
    for trgspk in TRGSPKS
}

# ---------- Model ----------
vc_models = {}
for trgspk in TRGSPKS:
    logger.info(f'============================= Setting up model for {trgspk} =============')
    checkpoint_path = vc_model_paths[trgspk]["ckpt"]
    config_path = vc_model_paths[trgspk]["config"]
    stats_path = vc_model_paths[trgspk]["stats"]
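    # Load the experiment config and attach the target speaker's mel statistics
    # (mean/scale); these are handed to both the decoder and the vocoder,
    # presumably to de/normalize features at the interface between the two.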
    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config["trg_stats"] = {
        "mean": torch.from_numpy(read_hdf5(stats_path, "mean")).float().to(DEVICE),
        "scale": torch.from_numpy(read_hdf5(stats_path, "scale")).float().to(DEVICE),
    }

    # define upstream model
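    # The upstream S3R model extracts multi-layer speech representations; the
    # s3prl Featurizer combines them (typically as a learned weighted sum over
    # layers), with its weights restored from the downstream checkpoint.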
    upstream_model = get_upstream(config["upstream"]).to(DEVICE)
    upstream_model.eval()
    upstream_featurizer = Featurizer(upstream_model).to(DEVICE)
    upstream_featurizer.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")["featurizer"]
    )
    upstream_featurizer.eval()

    # get model and load parameters
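    # The third positional argument is a frame-rate ratio: target mel frames
    # per second (sampling_rate / hop_size) divided by upstream feature frames
    # per second (16000 / downsample_rate), presumably so the decoder can
    # resample the features along the time axis.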
    model_class = getattr(s3prl_vc.models, config["model_type"])
    model = model_class(
        upstream_featurizer.output_size,
        config["num_mels"],
        config["sampling_rate"]
        / config["hop_size"]
        * upstream_featurizer.downsample_rate
        / 16000,
        config["trg_stats"],
        use_spemb=config.get("use_spk_emb", False),
        **config["model_params"],
    ).to(DEVICE)
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
    model = model.eval().to(DEVICE)
    logger.info(f"Loaded model parameters from {checkpoint_path}.")

    # load vocoder
    vocoder = Vocoder(
        vocoder_paths["ckpt"],
        vocoder_paths["config"],
        vocoder_paths["stats"],
        config["trg_stats"],
        DEVICE,
    )
    vc_models[trgspk] = {
        "upstream": upstream_model,
        "featurizer": upstream_featurizer,
        "decoder": model,
        "vocoder": vocoder,
    }


def predict(trgspk, wav_file):
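    """Convert wav_file into trgspk's voice and return the output wav path.

    Pipeline: S3R upstream -> featurizer -> any-to-one decoder -> vocoder.
    """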
    x, fs = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')
    with torch.no_grad():
        start_time = time.time()
        xs = torch.from_numpy(x).unsqueeze(0).float().to(DEVICE)
        ilens = torch.LongTensor([x.shape[0]]).to(DEVICE)
        all_hs, all_hlens = vc_models[trgspk]["upstream"](xs, ilens)
        logger.info('upstream done')
        hs, hlens = vc_models[trgspk]["featurizer"](all_hs, all_hlens)
        logger.info('featurizer done')
        outs, _ = vc_models[trgspk]["decoder"](hs, hlens, spk_embs=None)
        logger.info('downstream done')
        out = outs[0]
        y, sr = vc_models[trgspk]["vocoder"].decode(out)
        logger.info('vocoder done')
        # write at the sampling rate reported by the vocoder (24 kHz here)
        sf.write("out.wav", y.cpu().numpy(), sr, "PCM_16")
        logger.info('write done')
        # real-time factor: processing time divided by input duration
        logger.info('RTF={}'.format((time.time() - start_time) / (len(x) / 16000)))
    return "out.wav"


with gr.Blocks(title="S3PRL-VC: Any-to-one voice conversion demo on VCC2020") as demo:
    gr.Markdown(
        """
        # S3PRL-VC: Any-to-one voice conversion demo on VCC2020
        ### [[Paper (ICASSP2023)]](https://arxiv.org/abs/2110.06280) [[Paper (JSTSP)]](https://arxiv.org/abs/2207.04356) [[Code]](https://github.com/unilight/s3prl-vc)

        **S3PRL-VC** is a voice conversion (VC) toolkit for benchmarking self-supervised speech representations (S3Rs). **Any-to-one** means the system can convert the voice of any unseen speaker into that of a pre-defined target speaker seen during training.

        In this demo, you can record your voice, and the model will convert it into one of four pre-defined target speakers from the **Voice Conversion Challenge (VCC) 2020**. Listen to the reference samples to get a sense of what each speaker sounds like.

        The real-time factor (**RTF**) of the system is around **1.5~2.5**: a 5-second recording takes about 5 * (1.5~2.5) = 7.5~12.5 seconds to convert.
        """
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload a .wav file here!")
            input_wav = gr.Audio(label="Source speech", source='upload', type='filepath')
            gr.Markdown("## Select a target speaker!")
            trgspk = gr.Radio(label="Target speaker", choices=TRGSPKS)
            gr.Markdown("### Here is what the target speaker sounds like!")
            ref_sample_wav1 = gr.Audio(label="Sample 1", type="filepath")
            ref_sample_wav2 = gr.Audio(label="Sample 2", type="filepath")
            # refresh the two reference players whenever the target speaker changes
            trgspk.change(
                lambda trgspk: ref_samples[trgspk],
                inputs=trgspk,
                outputs=[ref_sample_wav1, ref_sample_wav2],
            )
            convert_btn = gr.Button(value="Convert!")
gr.Markdown("### You can use these examples if using a microphone is too troublesome!") | |
gr.Markdown("I recorded the samples using my Macbook Pro, so there might be some noises.") | |
            gr.Examples(
                examples=en_examples,
                inputs=input_wav,
                label="English examples",
            )
            gr.Examples(
                examples=jp_examples,
                inputs=input_wav,
                label="Japanese examples",
            )
            gr.Examples(
                examples=zh_examples,
                inputs=input_wav,
                label="Mandarin examples",
            )
        with gr.Column():
            gr.Markdown("## Listen to the converted speech here!")
            output_wav = gr.Audio(type="filepath", label="Converted speech")

    convert_btn.click(predict, [trgspk, input_wav], output_wav)

if __name__ == '__main__':
    try:
        demo.launch(
            debug=True,
            enable_queue=True,
        )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        demo.close()