|
--- |
|
language: |
|
- ms |
|
--- |
|
|
|
# Malay VITS Multispeaker clean V2 |
|
|
|
**This model is intended to be used with [malaya-speech](https://github.com/mesolitica/malaya-speech) only. It is possible to use it without the library, but make sure the character vocabulary is correct.** |
|
|
|
## How to use |
|
|
|
```python |
|
from huggingface_hub import snapshot_download |
|
from malaya_speech.torch_model.vits.model_infer import SynthesizerTrn |
|
from malaya_speech.torch_model.vits.commons import intersperse |
|
from malaya_speech.utils.text import TTS_SYMBOLS |
|
from malaya_speech.tts import load_text_ids |
|
import torch |
|
import os |
|
import json |
|
|
|
# HParams is imported from one of two malaya_boilerplate module paths
# (presumably depending on the installed version -- TODO confirm).
# Only ImportError triggers the fallback, so KeyboardInterrupt / SystemExit
# and unrelated failures are no longer silently swallowed.
try:
    from malaya_boilerplate.hparams import HParams
except ImportError:
    from malaya_boilerplate.train.config import HParams
|
|
|
# Fetch the model repository from the Hugging Face Hub; `folder` is the local
# directory the files are read from below.
folder = snapshot_download(repo_id="mesolitica/VITS-multispeaker-clean-v2")

# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding (which is not UTF-8 everywhere).
with open(os.path.join(folder, 'config.json'), encoding='utf-8') as fopen:
    hps = HParams(**json.load(fopen))

# Build the synthesizer from the hyper-parameters in config.json and switch it
# to evaluation mode.
model = SynthesizerTrn(
    len(TTS_SYMBOLS),  # size of the character vocabulary
    hps.data.filter_length // 2 + 1,  # presumably the number of spectrogram bins -- TODO confirm
    hps.train.segment_size // hps.data.hop_length,  # presumably the segment length in frames -- TODO confirm
    n_speakers=hps.data.n_speakers,
    **hps.model,
).eval()
# NOTE(review): torch.load unpickles the checkpoint; only load files from a
# repository you trust (consider `weights_only=True` if your torch version
# supports it). Weights are mapped to CPU.
model.load_state_dict(torch.load(os.path.join(folder, 'model.pth'), map_location='cpu'))
|
|
|
# Maps each speaker name to its integer id; ids follow the listed order,
# starting at 0.
speaker_id = {
    name: index
    for index, name in enumerate(
        ('Ariff', 'Ayu', 'Bunga', 'Danial', 'Elina', 'Kamarul', 'Osman', 'Yasmin')
    )
}
|
# Text front-end: turns a raw sentence into (normalized text, symbol ids).
normalizer = load_text_ids(pad_to=None, understand_punct=True, is_lower=False)

t, ids = normalizer.normalize(
    'saya nak makan nasi ayam yang sedap, lagi lazat, dan hidup sangatlah susah kan.',
    add_fullstop=False,
)
# When the config asks for it, insert the blank symbol (id 0) between ids.
if hps.data.add_blank:
    ids = intersperse(ids, 0)

# Shape the ids as a batch of one sequence and record that sequence's length.
ids = torch.LongTensor(ids).unsqueeze(0)
ids_lengths = torch.LongTensor([ids.size(1)])

# Speaker 0 is 'Ariff' in the `speaker_id` mapping.
sid = torch.tensor([0])

# Inference only, so gradient tracking is disabled. Both noise scales are 0.0
# (presumably making the output deterministic -- TODO confirm).
with torch.no_grad():
    audio = model.infer(
        ids,
        ids_lengths,
        sid=sid,
        noise_scale=0.0,
        noise_scale_w=0.0,
        length_scale=1.0,
    )
    # First element of the result, converted to a NumPy array.
    y_ = audio[0].numpy()
|
``` |
|
|