|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
- singing-voice-synthesis |
|
language: ja |
|
datasets: |
|
- kiritan |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 SVS model |
|
|
|
### `espnet/kiritan_svs_rnn` |
|
|
|
This model was trained by ftshijt using the kiritan recipe in [espnet](https://github.com/espnet/espnet/). |
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
|
if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38 |
|
pip install -e . |
|
cd egs2/kiritan/svs1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model espnet/kiritan_svs_rnn |
|
``` |
|
|
|
|
|
|
|
## SVS config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf/tuning/train_naive_rnn_dp.yaml |
|
print_config: false |
|
log_level: INFO |
|
drop_last_iter: false |
|
dry_run: false |
|
iterator_type: sequence |
|
valid_iterator_type: null |
|
output_dir: exp/svs_train_naive_rnn_dp_raw_phn_pyopenjtalk_jp |
|
ngpu: 1 |
|
seed: 0 |
|
num_workers: 8 |
|
num_att_plot: 3 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: null |
|
dist_rank: null |
|
local_rank: 0 |
|
dist_master_addr: null |
|
dist_master_port: null |
|
dist_launcher: null |
|
multiprocessing_distributed: false |
|
unused_parameters: false |
|
sharded_ddp: false |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: true |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 500 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
- - valid |
|
- loss |
|
- min |
|
- - train |
|
- loss |
|
- min |
|
keep_nbest_models: 2 |
|
nbest_averaging_interval: 0 |
|
grad_clip: 1.0 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 1 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: false |
|
log_interval: null |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
use_lora: false |
|
save_lora_only: true |
|
lora_conf: {} |
|
pretrain_path: null |
|
init_param: [] |
|
ignore_init_mismatch: false |
|
freeze_param: [] |
|
num_iters_per_epoch: null |
|
batch_size: 16 |
|
valid_batch_size: null |
|
batch_bins: 1000000 |
|
valid_batch_bins: null |
|
train_shape_file: |
|
- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn |
|
- exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape |
|
valid_shape_file: |
|
- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn |
|
- exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape |
|
batch_type: sorted |
|
valid_batch_type: null |
|
fold_length: |
|
- 150 |
|
- 240000 |
|
sort_in_batch: descending |
|
shuffle_within_batch: false |
|
sort_batch: descending |
|
multiple_iterator: false |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
chunk_excluded_key_prefixes: [] |
|
chunk_default_fs: null |
|
train_data_path_and_name_and_type: |
|
- - dump/raw/tr_no_dev/text |
|
- text |
|
- text |
|
- - dump/raw/tr_no_dev/wav.scp |
|
- singing |
|
- sound |
|
- - dump/raw/tr_no_dev/label |
|
- label |
|
- duration |
|
- - dump/raw/tr_no_dev/score.scp |
|
- score |
|
- score |
|
valid_data_path_and_name_and_type: |
|
- - dump/raw/dev/text |
|
- text |
|
- text |
|
- - dump/raw/dev/wav.scp |
|
- singing |
|
- sound |
|
- - dump/raw/dev/label |
|
- label |
|
- duration |
|
- - dump/raw/dev/score.scp |
|
- score |
|
- score |
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
allow_multi_rates: false |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adam |
|
optim_conf: |
|
lr: 0.001 |
|
eps: 1.0e-06 |
|
weight_decay: 0.0 |
|
scheduler: null |
|
scheduler_conf: {} |
|
token_list: |
|
- <blank> |
|
- <unk> |
|
- pau |
|
- a |
|
- i |
|
- o |
|
- e |
|
- u |
|
- k |
|
- n |
|
- r |
|
- t |
|
- m |
|
- d |
|
- s |
|
- N |
|
- sh |
|
- g |
|
- y |
|
- b |
|
- w |
|
- cl |
|
- ts |
|
- z |
|
- ch |
|
- j |
|
- h |
|
- f |
|
- p |
|
- ky |
|
- ry |
|
- hy |
|
- py |
|
- ny |
|
- <sos/eos> |
|
odim: null |
|
model_conf: {} |
|
use_preprocessor: true |
|
token_type: phn |
|
bpemodel: null |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: pyopenjtalk |
|
fs: 24000 |
|
score_feats_extract: syllable_score_feats |
|
score_feats_extract_conf: |
|
fs: 24000 |
|
n_fft: 2048 |
|
win_length: 1200 |
|
hop_length: 300 |
|
feats_extract: fbank |
|
feats_extract_conf: |
|
n_fft: 2048 |
|
hop_length: 300 |
|
win_length: 1200 |
|
fs: 24000 |
|
fmin: 80 |
|
fmax: 7600 |
|
n_mels: 80 |
|
normalize: global_mvn |
|
normalize_conf: |
|
stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz |
|
svs: naive_rnn_dp |
|
svs_conf: |
|
midi_dim: 129 |
|
embed_dim: 512 |
|
duration_dim: 500 |
|
eprenet_conv_layers: 0 |
|
eprenet_conv_chans: 256 |
|
eprenet_conv_filts: 3 |
|
elayers: 3 |
|
eunits: 256 |
|
ebidirectional: true |
|
midi_embed_integration_type: add |
|
dlayers: 2 |
|
dunits: 256 |
|
dbidirectional: true |
|
postnet_layers: 5 |
|
postnet_chans: 512 |
|
postnet_filts: 5 |
|
use_batch_norm: true |
|
reduction_factor: 1 |
|
eprenet_dropout_rate: 0.2 |
|
edropout_rate: 0.1 |
|
ddropout_rate: 0.1 |
|
postnet_dropout_rate: 0.5 |
|
init_type: pytorch |
|
use_masking: true |
|
pitch_extract: dio |
|
pitch_extract_conf: |
|
use_token_averaged_f0: false |
|
fs: 24000 |
|
n_fft: 2048 |
|
hop_length: 300 |
|
f0max: 800 |
|
f0min: 80 |
|
reduction_factor: 1 |
|
pitch_normalize: global_mvn |
|
pitch_normalize_conf: |
|
stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz |
|
ying_extract: null |
|
ying_extract_conf: {} |
|
energy_extract: null |
|
energy_extract_conf: {} |
|
energy_normalize: null |
|
energy_normalize_conf: {} |
|
required: |
|
- output_dir |
|
- token_list |
|
version: '202310' |
|
distributed: false |
|
``` |
|
|
|
</details> |
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```bibtex |
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
@inproceedings{shi22d_interspeech, |
|
author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin}, |
|
title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}}, |
|
year=2022, |
|
booktitle={Proc. Interspeech 2022}, |
|
pages={4277--4281}, |
|
doi={10.21437/Interspeech.2022-10039} |
|
} |
|
``` |
|
|
|
or arXiv: |
|
|
|
```bibtex |
|
@misc{watanabe2018espnet, |
|
title={ESPnet: End-to-End Speech Processing Toolkit}, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
year={2018}, |
|
eprint={1804.00015}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|