|
import numpy as np |
|
import onnxruntime |
|
|
|
import utils |
|
from text import text_to_sequence, sequence_to_text |
|
import torch |
|
import gradio as gr |
|
import soundfile as sf |
|
import tempfile |
|
import yaml |
|
import json |
|
|
|
from time import perf_counter |
|
|
|
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``.

    For an input of ``n`` elements the result has ``2 * n + 1`` entries:
    ``item`` sits at every even index and the original elements, in order,
    at every odd index.
    """
    padded = [item]
    for element in lst:
        padded.append(element)
        padded.append(item)
    return padded
|
|
|
|
|
def process_text(i: int, text: str, device: "torch.device | str"):
    """Turn input text into the token-id arrays fed to the acoustic model.

    The text is converted with ``text_to_sequence`` using the
    ``catalan_cleaners`` cleaner, a ``0`` id is interspersed around every
    token, and a leading batch dimension of size 1 is added.

    Args:
        i: Index printed in the log line for this utterance.
        text: Raw input text.
        device: Device on which the intermediate tensors are created
            (a ``torch.device`` or a device string such as ``"cpu"``).

    Returns:
        A tuple ``(x, x_lengths)`` of NumPy int64 arrays: ``x`` holds the
        token ids with shape ``(1, T)`` and ``x_lengths`` holds ``[T]``.
    """
    print(f"[{i}] - Input text: {text}")

    token_ids = intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    # Log the sequence mapped back to text so the front-end output can be inspected.
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)

    # ``Tensor.numpy()`` only works for CPU tensors; move first so that any
    # ``device`` value is supported.
    return x.cpu().numpy(), x_lengths.cpu().numpy()
|
|
|
# ONNX model files.
MODEL_PATH_MATCHA_MEL = "matcha_multispeaker_cat_opset_15_10_steps_lastwords.onnx"
MODEL_PATH_MATCHA = "matcha_hifigan_multispeaker_cat.onnx"
MODEL_PATH_VOCOS = "mel_spec_22khz_v2.onnx"

# YAML file read by vocos_inference for the STFT parameters.
CONFIG_PATH = "config.yaml"

# JSON file mapping speaker names to speaker ids.
SPEAKER_ID_DICT = "spk_to_id.json"
|
|
|
# One default-configured SessionOptions object is shared by both sessions,
# and both sessions are created with the CPU execution provider only.
sess_options = onnxruntime.SessionOptions()

# Text -> mel spectrogram model.
model_matcha_mel = onnxruntime.InferenceSession(
    str(MODEL_PATH_MATCHA_MEL),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)

# Mel spectrogram -> STFT coefficients model (see vocos_inference).
model_vocos = onnxruntime.InferenceSession(
    str(MODEL_PATH_VOCOS),
    sess_options=sess_options,
    providers=["CPUExecutionProvider"],
)
|
|
|
|
|
# Speaker name -> speaker id table. The file is opened in a ``with`` block so
# the handle is closed deterministically instead of being left to the GC.
with open(SPEAKER_ID_DICT, encoding="utf-8") as speaker_file:
    speaker_id_dict = json.load(speaker_file)

# Alphabetically sorted speaker names, used as the dropdown choices.
speakers = sorted(speaker_id_dict)
|
|
|
def _remove_vocoder_bias(spectrogram, mel, strength=0.0025):
    """Subtract the vocoder's response to an all-zero mel from the magnitudes.

    The vocoder is run once more on a zero-filled input with the same shape
    as ``mel``; the magnitude of that output, scaled by ``strength``, is
    subtracted from the magnitude of ``spectrogram`` (clamped at zero) while
    the phase of ``spectrogram`` is kept.
    """
    silent_mel = np.zeros_like(mel, dtype=np.float32)
    mag_bias, real_bias, imag_bias = model_vocos.run(None, {"mels": silent_mel})
    spectrogram_bias = torch.as_tensor(mag_bias * (real_bias + 1j * imag_bias))

    magnitude = spectrogram.abs()
    magnitude_bias = spectrogram_bias.abs()
    denoised = torch.clamp(magnitude - magnitude_bias * strength, min=0.0)

    # Rebuild the complex spectrogram from the denoised magnitude and the
    # original phase.
    return torch.polar(denoised, spectrogram.angle())


def _istft(spectrogram, n_fft, hop_length, win_length):
    """Invert a complex spectrogram of shape (B, N, T) by windowed overlap-add.

    Returns a real tensor of shape (B, samples) where ``pad`` samples
    (``(win_length - hop_length) // 2``) are trimmed from each end.
    """
    window = torch.hann_window(win_length)
    pad = (win_length - hop_length) // 2
    _, _, num_frames = spectrogram.shape

    # Per-frame inverse FFT along the frequency axis, then apply the window.
    frames = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    frames = frames * window[None, :, None]

    output_size = (num_frames - 1) * hop_length + win_length
    # Explicit end index: ``pad:-pad`` would produce an empty slice when
    # pad == 0 (i.e. when hop_length == win_length).
    end = output_size - pad
    fold_kwargs = {
        "output_size": (1, output_size),
        "kernel_size": (1, win_length),
        "stride": (1, hop_length),
    }

    # Overlap-add the windowed frames.
    audio = torch.nn.functional.fold(frames, **fold_kwargs)[:, 0, 0, pad:end]

    # Overlap-add of the squared window, used to normalise the signal.
    window_sq = window.square().expand(1, num_frames, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(window_sq, **fold_kwargs)[0, 0, 0, pad:end]

    # Guard against dividing by a (near-)zero envelope.
    assert (window_envelope > 1e-11).all(), "window envelope is too close to zero"
    return audio / window_envelope


def vocos_inference(mel, denoise):
    """Synthesise a waveform from a mel spectrogram with the Vocos ONNX model.

    The STFT parameters (``n_fft``, ``hop_length``) are read from
    ``CONFIG_PATH`` under ``feature_extractor.init_args``; the window length
    is set equal to ``n_fft``. The model is run on ``mel`` and its three
    outputs are combined into a complex spectrogram, which is then inverted
    with a Hann-windowed overlap-add.

    Args:
        mel: Array passed to the model as its ``"mels"`` input.
        denoise: When true, the vocoder's output for an all-zero input is
            subtracted (scaled) from the spectrogram magnitudes before
            inversion.

    Returns:
        A torch tensor of shape (B, samples) containing the waveform.
    """
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    params = config["feature_extractor"]["init_args"]
    n_fft = params["n_fft"]
    hop_length = params["hop_length"]
    win_length = n_fft

    mag, real, imag = model_vocos.run(None, {"mels": mel})
    # as_tensor avoids the copy-construct warning that torch.tensor() emits
    # when it is handed an existing tensor.
    spectrogram = torch.as_tensor(mag * (real + 1j * imag))

    if denoise:
        spectrogram = _remove_vocoder_bias(spectrogram, mel)

    print("Spectrogram synthesized shape", spectrogram.shape)

    return _istft(spectrogram, n_fft, hop_length, win_length)
|
|
|
|
|
def tts(text: str, spk_name: str, temperature: float, length_scale: float, denoise: bool) -> str:
    """Synthesise ``text`` with the Matcha mel model and the Vocos vocoder.

    Args:
        text: Text to synthesise.
        spk_name: Speaker name; must be a key of ``speaker_id_dict``.
        temperature: First entry of the model's ``scales`` input.
        length_scale: Second entry of the model's ``scales`` input.
        denoise: Forwarded to ``vocos_inference``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        KeyError: If ``spk_name`` is not in ``speaker_id_dict``.
    """
    import os  # local import: only needed to pick the output directory

    sample_rate = 22050
    preferred_dir = "/home/user/app"

    spk_id = speaker_id_dict[spk_name]
    sid = np.array([int(spk_id)]) if spk_id is not None else None
    text_matcha, text_lengths = process_text(0, text, "cpu")

    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([temperature, length_scale], dtype=np.float32),
        "spks": sid,
    }

    mel_t0 = perf_counter()
    mel, _ = model_matcha_mel.run(None, inputs)
    mel_infer_secs = perf_counter() - mel_t0
    print("Matcha Mel inference time", mel_infer_secs)

    vocos_t0 = perf_counter()
    wavs_vocos = vocos_inference(mel, denoise)
    vocos_infer_secs = perf_counter() - vocos_t0
    print("Vocos inference time", vocos_infer_secs)

    # Use the preferred directory when it exists; otherwise fall back to the
    # system temp directory instead of failing with FileNotFoundError.
    out_dir = preferred_dir if os.path.isdir(preferred_dir) else None
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=out_dir) as fp_matcha_vocos:
        wav_path = fp_matcha_vocos.name

    # Write after the temporary handle is closed so the file is not opened
    # twice at the same time.
    sf.write(wav_path, wavs_vocos.squeeze(0).numpy(), sample_rate, "PCM_24")

    # Real-time factor: inference time divided by audio duration.
    audio_secs = wavs_vocos.shape[1] / sample_rate
    if audio_secs > 0:
        print(f"RTF matcha + vocos {(mel_infer_secs + vocos_infer_secs) / audio_secs}")

    return wav_path
|
|
|
|
|
|
|
# HTML header rendered at the top of the demo page.
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
TTS Vocoder Comparison
</h1> </div>
</div>
"""

# Markdown shown below the header.
description = """

🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis

For vocoders we use Hifigan universal version and Vocos trained in a catalan set of ~28 hours.

Matcha was trained using openslr69 and festcat datasets
"""

# Footer text shown at the bottom of the page.
article = "Training and demo by BSC."
|
|
|
# Interface wrapping tts(): the five inputs below map, in order, to the
# parameters (text, spk_name, temperature, length_scale, denoise); the single
# output receives the returned wav file path.
vits2_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
            label="Input text",
        ),
        gr.Dropdown(
            choices=speakers,
            label="Speaker id",
            value="caf_09204",
            info="Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.667,
            step=0.01,
            label="Temperature",
            info="Temperature",
        ),
        gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.01,
            label="Length scale",
            info="Controls speech pace, larger values for slower pace and smaller values for faster pace",
        ),
        gr.Checkbox(label="Denoise", info="Removes model bias from vocos", value=True),
    ],
    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath")],
)
|
|
|
# Page layout: header, description, the tabbed synthesis interface, footer.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
    gr.Markdown(article)

# Enable the request queue with at most 10 pending requests, then serve on
# all interfaces at port 7860 with the API page hidden.
demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)
|
|