Spaces:
Running
Running
Yurii Paniv
committed on
Commit
•
c527edf
1
Parent(s):
573b8cf
Add joint tacotron2_hifigan config
Browse files- training/STEPS.md +24 -5
- training/esp_test.ipynb +0 -114
- training/finetune_joint_tacotron2_hifigan.yaml +255 -0
- tts_example.ipynb +6 -65
training/STEPS.md
CHANGED
@@ -5,7 +5,7 @@ Link: https://espnet.github.io/espnet/installation.html
|
|
5 |
sudo apt-get install cmake sox libsndfile1-dev ffmpeg
|
6 |
git clone --branch v.202301 https://github.com/espnet/espnet
|
7 |
cd ./espnet/tools
|
8 |
-
./setup_anaconda.sh anaconda espnet 3.
|
9 |
. ./activate_python.sh
|
10 |
make
|
11 |
pip install --upgrade torch torchaudio # or setup same versions
|
@@ -19,15 +19,17 @@ ESPNET is a dynamic framework. For the latest guide, please refer to https://git
|
|
19 |
|
20 |
This page provides general launching steps on how training was performed for reference, and this doesn't cover data preparation.
|
21 |
|
22 |
-
NOTE: before running the script below, copy [./train_vits.yaml](./train_vits.yaml) to your `<espnet_root>/egs2/ljspeech/tts1/conf/tuning
|
23 |
|
24 |
|
25 |
```sh
|
26 |
cd ../egs2/ljspeech/tts1
|
27 |
pip install torchvision # to save figures
|
28 |
-
pip install speechbrain
|
|
|
29 |
./run.sh \
|
30 |
-
--stage
|
|
|
31 |
--use_xvector true \
|
32 |
--xvector_tool speechbrain \
|
33 |
--fs 22050 \
|
@@ -41,4 +43,21 @@ pip install speechbrain
|
|
41 |
--feats_normalize none \
|
42 |
--train_config ./conf/tuning/train_vits.yaml \
|
43 |
--inference_config ./conf/tuning/decode_vits.yaml
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
sudo apt-get install cmake sox libsndfile1-dev ffmpeg
|
6 |
git clone --branch v.202301 https://github.com/espnet/espnet
|
7 |
cd ./espnet/tools
|
8 |
+
./setup_anaconda.sh anaconda espnet 3.10
|
9 |
. ./activate_python.sh
|
10 |
make
|
11 |
pip install --upgrade torch torchaudio # or setup same versions
|
|
|
19 |
|
20 |
This page provides general launching steps on how training was performed for reference, and this doesn't cover data preparation.
|
21 |
|
22 |
+
NOTE: before running the script below, copy [./train_vits.yaml](./train_vits.yaml) or [./finetune_joint_tacotron2_hifigan.yaml](./finetune_joint_tacotron2_hifigan.yaml) to your `<espnet_root>/egs2/ljspeech/tts1/conf/tuning/` folder
|
23 |
|
24 |
|
25 |
```sh
|
26 |
cd ../egs2/ljspeech/tts1
|
27 |
pip install torchvision # to save figures
|
28 |
+
pip install speechbrain # for x-vectors
|
29 |
+
# option 1: train VITS
|
30 |
./run.sh \
|
31 |
+
--stage 6 \
|
32 |
+
--min_wav_duration 0.38 \
|
33 |
--use_xvector true \
|
34 |
--xvector_tool speechbrain \
|
35 |
--fs 22050 \
|
|
|
43 |
--feats_normalize none \
|
44 |
--train_config ./conf/tuning/train_vits.yaml \
|
45 |
--inference_config ./conf/tuning/decode_vits.yaml
|
46 |
+
# option 2: train tacotron2 and hifigan jointly
|
47 |
+
./run.sh \
|
48 |
+
--stage 6 \
|
49 |
+
--min_wav_duration 0.38 \
|
50 |
+
--use_xvector true \
|
51 |
+
--xvector_tool speechbrain \
|
52 |
+
--fs 22050 \
|
53 |
+
--n_fft 1024 \
|
54 |
+
--n_shift 256 \
|
55 |
+
--win_length null \
|
56 |
+
--dumpdir dump/22k \
|
57 |
+
--expdir exp/22k \
|
58 |
+
--train_config ./conf/tuning/finetune_joint_tacotron2_hifigan.yaml \
|
59 |
+
--tts_task gan_tts
|
60 |
+
|
61 |
+
```
|
62 |
+
|
63 |
+
|
training/esp_test.ipynb
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 3,
|
6 |
-
"metadata": {},
|
7 |
-
"outputs": [],
|
8 |
-
"source": [
|
9 |
-
"#@title Choose English model { run: \"auto\" }\n",
|
10 |
-
"lang = 'English'\n",
|
11 |
-
"tag = 'training/espnet/egs2/ljspeech/tts1' #@param [\"kan-bayashi/ljspeech_tacotron2\", \"kan-bayashi/ljspeech_fastspeech\", \"kan-bayashi/ljspeech_fastspeech2\", \"kan-bayashi/ljspeech_conformer_fastspeech2\", \"kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_vits\"] {type:\"string\"}\n",
|
12 |
-
"vocoder_tag = \"none\" #@param [\"none\", \"parallel_wavegan/ljspeech_parallel_wavegan.v1\", \"parallel_wavegan/ljspeech_full_band_melgan.v2\", \"parallel_wavegan/ljspeech_multi_band_melgan.v2\", \"parallel_wavegan/ljspeech_hifigan.v1\", \"parallel_wavegan/ljspeech_style_melgan.v1\"] {type:\"string\"}"
|
13 |
-
]
|
14 |
-
},
|
15 |
-
{
|
16 |
-
"cell_type": "code",
|
17 |
-
"execution_count": 7,
|
18 |
-
"metadata": {},
|
19 |
-
"outputs": [
|
20 |
-
{
|
21 |
-
"ename": "FileNotFoundError",
|
22 |
-
"evalue": "[Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'",
|
23 |
-
"output_type": "error",
|
24 |
-
"traceback": [
|
25 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
26 |
-
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
27 |
-
"Cell \u001b[0;32mIn[7], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbin\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtts_inference\u001b[39;00m \u001b[39mimport\u001b[39;00m Text2Speech\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m str_or_none\n\u001b[0;32m----> 4\u001b[0m text2speech \u001b[39m=\u001b[39m Text2Speech(\n\u001b[1;32m 5\u001b[0m train_config\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 6\u001b[0m model_file\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 7\u001b[0m device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 8\u001b[0m \u001b[39m# Only for Tacotron 2 & Transformer\u001b[39;49;00m\n\u001b[1;32m 9\u001b[0m threshold\u001b[39m=\u001b[39;49m\u001b[39m0.5\u001b[39;49m,\n\u001b[1;32m 10\u001b[0m \u001b[39m# Only for Tacotron 2\u001b[39;49;00m\n\u001b[1;32m 11\u001b[0m minlenratio\u001b[39m=\u001b[39;49m\u001b[39m0.0\u001b[39;49m,\n\u001b[1;32m 12\u001b[0m maxlenratio\u001b[39m=\u001b[39;49m\u001b[39m10.0\u001b[39;49m,\n\u001b[1;32m 13\u001b[0m use_att_constraint\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 14\u001b[0m backward_window\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m,\n\u001b[1;32m 15\u001b[0m forward_window\u001b[39m=\u001b[39;49m\u001b[39m3\u001b[39;49m,\n\u001b[1;32m 16\u001b[0m \u001b[39m# 
Only for FastSpeech & FastSpeech2 & VITS\u001b[39;49;00m\n\u001b[1;32m 17\u001b[0m speed_control_alpha\u001b[39m=\u001b[39;49m\u001b[39m4\u001b[39;49m,\n\u001b[1;32m 18\u001b[0m \u001b[39m# Only for VITS\u001b[39;49;00m\n\u001b[1;32m 19\u001b[0m noise_scale\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 20\u001b[0m noise_scale_dur\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 21\u001b[0m )\n",
|
28 |
-
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/bin/tts_inference.py:92\u001b[0m, in \u001b[0;36mText2Speech.__init__\u001b[0;34m(self, train_config, model_file, threshold, minlenratio, maxlenratio, use_teacher_forcing, use_att_constraint, backward_window, forward_window, speed_control_alpha, noise_scale, noise_scale_dur, vocoder_config, vocoder_file, dtype, device, seed, always_fix_seed, prefer_normalized_feats)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[39massert\u001b[39;00m check_argument_types()\n\u001b[1;32m 91\u001b[0m \u001b[39m# setup model\u001b[39;00m\n\u001b[0;32m---> 92\u001b[0m model, train_args \u001b[39m=\u001b[39m TTSTask\u001b[39m.\u001b[39;49mbuild_model_from_file(\n\u001b[1;32m 93\u001b[0m train_config, model_file, device\n\u001b[1;32m 94\u001b[0m )\n\u001b[1;32m 95\u001b[0m model\u001b[39m.\u001b[39mto(dtype\u001b[39m=\u001b[39m\u001b[39mgetattr\u001b[39m(torch, dtype))\u001b[39m.\u001b[39meval()\n\u001b[1;32m 96\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdevice \u001b[39m=\u001b[39m device\n",
|
29 |
-
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/abs_task.py:1822\u001b[0m, in \u001b[0;36mAbsTask.build_model_from_file\u001b[0;34m(cls, config_file, model_file, device)\u001b[0m\n\u001b[1;32m 1820\u001b[0m args \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(f)\n\u001b[1;32m 1821\u001b[0m args \u001b[39m=\u001b[39m argparse\u001b[39m.\u001b[39mNamespace(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39margs)\n\u001b[0;32m-> 1822\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mbuild_model(args)\n\u001b[1;32m 1823\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, AbsESPnetModel):\n\u001b[1;32m 1824\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 1825\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mmodel must inherit \u001b[39m\u001b[39m{\u001b[39;00mAbsESPnetModel\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m, but got \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mtype\u001b[39m(model)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1826\u001b[0m )\n",
|
30 |
-
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/tts.py:309\u001b[0m, in \u001b[0;36mTTSTask.build_model\u001b[0;34m(cls, args)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[39mif\u001b[39;00m args\u001b[39m.\u001b[39mnormalize \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 308\u001b[0m normalize_class \u001b[39m=\u001b[39m normalize_choices\u001b[39m.\u001b[39mget_class(args\u001b[39m.\u001b[39mnormalize)\n\u001b[0;32m--> 309\u001b[0m normalize \u001b[39m=\u001b[39m normalize_class(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49margs\u001b[39m.\u001b[39;49mnormalize_conf)\n\u001b[1;32m 310\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 311\u001b[0m normalize \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
|
31 |
-
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/layers/global_mvn.py:40\u001b[0m, in \u001b[0;36mGlobalMVN.__init__\u001b[0;34m(self, stats_file, norm_means, norm_vars, eps)\u001b[0m\n\u001b[1;32m 37\u001b[0m stats_file \u001b[39m=\u001b[39m Path(stats_file)\n\u001b[1;32m 39\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstats_file \u001b[39m=\u001b[39m stats_file\n\u001b[0;32m---> 40\u001b[0m stats \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mload(stats_file)\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(stats, np\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m 42\u001b[0m \u001b[39m# Kaldi like stats\u001b[39;00m\n\u001b[1;32m 43\u001b[0m count \u001b[39m=\u001b[39m stats[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mflatten()[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n",
|
32 |
-
"File \u001b[0;32m~/.miniconda3/envs/espnet/lib/python3.8/site-packages/numpy/lib/npyio.py:390\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 388\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 390\u001b[0m fid \u001b[39m=\u001b[39m stack\u001b[39m.\u001b[39menter_context(\u001b[39mopen\u001b[39;49m(os_fspath(file), \u001b[39m\"\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m\"\u001b[39;49m))\n\u001b[1;32m 391\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[39m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
|
33 |
-
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'"
|
34 |
-
]
|
35 |
-
}
|
36 |
-
],
|
37 |
-
"source": [
|
38 |
-
"from espnet2.bin.tts_inference import Text2Speech\n",
|
39 |
-
"from espnet2.utils.types import str_or_none\n",
|
40 |
-
"\n",
|
41 |
-
"text2speech = Text2Speech(\n",
|
42 |
-
" train_config=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\",\n",
|
43 |
-
" model_file=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\",\n",
|
44 |
-
" device=\"cuda\",\n",
|
45 |
-
" # Only for Tacotron 2 & Transformer\n",
|
46 |
-
" threshold=0.5,\n",
|
47 |
-
" # Only for Tacotron 2\n",
|
48 |
-
" minlenratio=0.0,\n",
|
49 |
-
" maxlenratio=10.0,\n",
|
50 |
-
" use_att_constraint=False,\n",
|
51 |
-
" backward_window=1,\n",
|
52 |
-
" forward_window=3,\n",
|
53 |
-
" # Only for FastSpeech & FastSpeech2 & VITS\n",
|
54 |
-
" speed_control_alpha=4,\n",
|
55 |
-
" # Only for VITS\n",
|
56 |
-
" noise_scale=0.333,\n",
|
57 |
-
" noise_scale_dur=0.333,\n",
|
58 |
-
")\n"
|
59 |
-
]
|
60 |
-
},
|
61 |
-
{
|
62 |
-
"cell_type": "code",
|
63 |
-
"execution_count": null,
|
64 |
-
"metadata": {},
|
65 |
-
"outputs": [],
|
66 |
-
"source": [
|
67 |
-
"import time\n",
|
68 |
-
"import torch\n",
|
69 |
-
"\n",
|
70 |
-
"# decide the input sentence by yourself\n",
|
71 |
-
"print(f\"Input your favorite sentence in {lang}.\")\n",
|
72 |
-
"x = input()\n",
|
73 |
-
"\n",
|
74 |
-
"# synthesis\n",
|
75 |
-
"with torch.no_grad():\n",
|
76 |
-
" start = time.time()\n",
|
77 |
-
" wav = text2speech(x)[\"wav\"]\n",
|
78 |
-
"rtf = (time.time() - start) / (len(wav) / text2speech.fs)\n",
|
79 |
-
"print(f\"RTF = {rtf:5f}\")\n",
|
80 |
-
"\n",
|
81 |
-
"# let us listen to generated samples\n",
|
82 |
-
"from IPython.display import display, Audio\n",
|
83 |
-
"display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))"
|
84 |
-
]
|
85 |
-
}
|
86 |
-
],
|
87 |
-
"metadata": {
|
88 |
-
"kernelspec": {
|
89 |
-
"display_name": "Python 3.8.15 ('espnet')",
|
90 |
-
"language": "python",
|
91 |
-
"name": "python3"
|
92 |
-
},
|
93 |
-
"language_info": {
|
94 |
-
"codemirror_mode": {
|
95 |
-
"name": "ipython",
|
96 |
-
"version": 3
|
97 |
-
},
|
98 |
-
"file_extension": ".py",
|
99 |
-
"mimetype": "text/x-python",
|
100 |
-
"name": "python",
|
101 |
-
"nbconvert_exporter": "python",
|
102 |
-
"pygments_lexer": "ipython3",
|
103 |
-
"version": "3.8.15"
|
104 |
-
},
|
105 |
-
"orig_nbformat": 4,
|
106 |
-
"vscode": {
|
107 |
-
"interpreter": {
|
108 |
-
"hash": "baacc56cbf39183fce53815df8d7ef29797de9f36fbce345069f80337ea8dac3"
|
109 |
-
}
|
110 |
-
}
|
111 |
-
},
|
112 |
-
"nbformat": 4,
|
113 |
-
"nbformat_minor": 2
|
114 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
training/finetune_joint_tacotron2_hifigan.yaml
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This EXPERIMENTAL configuration is for ESPnet2 to finetune
|
2 |
+
# Tacotron2 + HiFiGAN vocoder jointly. To run
|
3 |
+
# this config, you need to specify "--tts_task gan_tts"
|
4 |
+
# option for tts.sh at least and use 22050 hz audio as the
|
5 |
+
# training data (mainly tested on LJspeech).
|
6 |
+
# This configuration was tested on 4 GPUs with 12GB GPU memory.
|
7 |
+
# It takes a little less than 1 week to finish the training, but
|
8 |
+
# 100k iters model should generate reasonable results.
|
9 |
+
|
10 |
+
# YOU NEED TO MODIFY THE "*_params" AND "init_param" SECTIONS
|
11 |
+
# IF YOU WANT TO USE YOUR OWN PRETRAINED MODELS.
|
12 |
+
|
13 |
+
##########################################################
|
14 |
+
# TTS MODEL SETTING #
|
15 |
+
##########################################################
|
16 |
+
tts: joint_text2wav
|
17 |
+
tts_conf:
|
18 |
+
# copied from pretrained model's config.yaml
|
19 |
+
text2mel_type: tacotron2
|
20 |
+
text2mel_params:
|
21 |
+
embed_dim: 512 # char or phn embedding dimension
|
22 |
+
elayers: 1 # number of blstm layers in encoder
|
23 |
+
eunits: 512 # number of blstm units
|
24 |
+
econv_layers: 3 # number of convolutional layers in encoder
|
25 |
+
econv_chans: 512 # number of channels in convolutional layer
|
26 |
+
econv_filts: 5 # filter size of convolutional layer
|
27 |
+
atype: location # attention function type
|
28 |
+
adim: 512 # attention dimension
|
29 |
+
aconv_chans: 32 # number of channels in convolutional layer of attention
|
30 |
+
aconv_filts: 15 # filter size of convolutional layer of attention
|
31 |
+
cumulate_att_w: true # whether to cumulate attention weight
|
32 |
+
dlayers: 2 # number of lstm layers in decoder
|
33 |
+
dunits: 1024 # number of lstm units in decoder
|
34 |
+
prenet_layers: 2 # number of layers in prenet
|
35 |
+
prenet_units: 256 # number of units in prenet
|
36 |
+
postnet_layers: 5 # number of layers in postnet
|
37 |
+
postnet_chans: 512 # number of channels in postnet
|
38 |
+
postnet_filts: 5 # filter size of postnet layer
|
39 |
+
output_activation: null # activation function for the final output
|
40 |
+
use_batch_norm: true # whether to use batch normalization in encoder
|
41 |
+
use_concate: true # whether to concatenate encoder embedding with decoder outputs
|
42 |
+
use_residual: false # whether to use residual connection in encoder
|
43 |
+
spk_embed_dim: 192 # speaker embedding dimension
|
44 |
+
spk_embed_integration_type: add # how to integrate speaker embedding
|
45 |
+
dropout_rate: 0.5 # dropout rate
|
46 |
+
zoneout_rate: 0.1 # zoneout rate
|
47 |
+
reduction_factor: 1 # reduction factor
|
48 |
+
use_masking: true # whether to apply masking for padded part in loss calculation
|
49 |
+
bce_pos_weight: 10.0 # weight of positive sample in binary cross entropy calculation
|
50 |
+
use_guided_attn_loss: true # whether to use guided attention loss
|
51 |
+
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
|
52 |
+
guided_attn_loss_lambda: 1.0 # strength of guided attention loss
|
53 |
+
|
54 |
+
# copied from pretrained vocoder's config.yaml
|
55 |
+
vocoder_type: hifigan_generator
|
56 |
+
vocoder_params:
|
57 |
+
bias: true
|
58 |
+
channels: 512
|
59 |
+
in_channels: 80
|
60 |
+
kernel_size: 7
|
61 |
+
nonlinear_activation: LeakyReLU
|
62 |
+
nonlinear_activation_params:
|
63 |
+
negative_slope: 0.1
|
64 |
+
out_channels: 1
|
65 |
+
resblock_dilations:
|
66 |
+
- - 1
|
67 |
+
- 3
|
68 |
+
- 5
|
69 |
+
- - 1
|
70 |
+
- 3
|
71 |
+
- 5
|
72 |
+
- - 1
|
73 |
+
- 3
|
74 |
+
- 5
|
75 |
+
resblock_kernel_sizes:
|
76 |
+
- 3
|
77 |
+
- 7
|
78 |
+
- 11
|
79 |
+
upsample_kernel_sizes:
|
80 |
+
- 16
|
81 |
+
- 16
|
82 |
+
- 4
|
83 |
+
- 4
|
84 |
+
upsample_scales:
|
85 |
+
- 8
|
86 |
+
- 8
|
87 |
+
- 2
|
88 |
+
- 2
|
89 |
+
use_additional_convs: true
|
90 |
+
use_weight_norm: true
|
91 |
+
|
92 |
+
# copied from pretrained vocoder's config.yaml
|
93 |
+
discriminator_type: hifigan_multi_scale_multi_period_discriminator
|
94 |
+
discriminator_params:
|
95 |
+
follow_official_norm: true
|
96 |
+
period_discriminator_params:
|
97 |
+
bias: true
|
98 |
+
channels: 32
|
99 |
+
downsample_scales:
|
100 |
+
- 3
|
101 |
+
- 3
|
102 |
+
- 3
|
103 |
+
- 3
|
104 |
+
- 1
|
105 |
+
in_channels: 1
|
106 |
+
kernel_sizes:
|
107 |
+
- 5
|
108 |
+
- 3
|
109 |
+
max_downsample_channels: 1024
|
110 |
+
nonlinear_activation: LeakyReLU
|
111 |
+
nonlinear_activation_params:
|
112 |
+
negative_slope: 0.1
|
113 |
+
out_channels: 1
|
114 |
+
use_spectral_norm: false
|
115 |
+
use_weight_norm: true
|
116 |
+
periods:
|
117 |
+
- 2
|
118 |
+
- 3
|
119 |
+
- 5
|
120 |
+
- 7
|
121 |
+
- 11
|
122 |
+
scale_discriminator_params:
|
123 |
+
bias: true
|
124 |
+
channels: 128
|
125 |
+
downsample_scales:
|
126 |
+
- 4
|
127 |
+
- 4
|
128 |
+
- 4
|
129 |
+
- 4
|
130 |
+
- 1
|
131 |
+
in_channels: 1
|
132 |
+
kernel_sizes:
|
133 |
+
- 15
|
134 |
+
- 41
|
135 |
+
- 5
|
136 |
+
- 3
|
137 |
+
max_downsample_channels: 1024
|
138 |
+
max_groups: 16
|
139 |
+
nonlinear_activation: LeakyReLU
|
140 |
+
nonlinear_activation_params:
|
141 |
+
negative_slope: 0.1
|
142 |
+
out_channels: 1
|
143 |
+
scale_downsample_pooling: AvgPool1d
|
144 |
+
scale_downsample_pooling_params:
|
145 |
+
kernel_size: 4
|
146 |
+
padding: 2
|
147 |
+
stride: 2
|
148 |
+
scales: 3
|
149 |
+
|
150 |
+
# loss function related
|
151 |
+
generator_adv_loss_params:
|
152 |
+
average_by_discriminators: false # whether to average loss value by #discriminators
|
153 |
+
loss_type: mse # loss type, "mse" or "hinge"
|
154 |
+
discriminator_adv_loss_params:
|
155 |
+
average_by_discriminators: false # whether to average loss value by #discriminators
|
156 |
+
loss_type: mse # loss type, "mse" or "hinge"
|
157 |
+
use_feat_match_loss: true # whether to use feat match loss
|
158 |
+
feat_match_loss_params:
|
159 |
+
average_by_discriminators: false # whether to average loss value by #discriminators
|
160 |
+
average_by_layers: false # whether to average loss value by #layers of each discriminator
|
161 |
+
include_final_outputs: true # whether to include final outputs for loss calculation
|
162 |
+
use_mel_loss: true # whether to use mel-spectrogram loss
|
163 |
+
mel_loss_params:
|
164 |
+
fs: 22050 # must be the same as the training data
|
165 |
+
n_fft: 1024 # fft points
|
166 |
+
hop_length: 256 # hop size
|
167 |
+
win_length: null # window length
|
168 |
+
window: hann # window type
|
169 |
+
n_mels: 80 # number of Mel basis
|
170 |
+
fmin: 0 # minimum frequency for Mel basis
|
171 |
+
fmax: null # maximum frequency for Mel basis
|
172 |
+
log_base: null # null represents natural log
|
173 |
+
lambda_text2mel: 1.0 # loss scaling coefficient for text2mel loss
|
174 |
+
lambda_adv: 1.0 # loss scaling coefficient for adversarial loss
|
175 |
+
lambda_mel: 45.0 # loss scaling coefficient for Mel loss
|
176 |
+
lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
|
177 |
+
|
178 |
+
# others
|
179 |
+
sampling_rate: 22050 # needed in the inference for saving wav
|
180 |
+
segment_size: 32 # segment size for random windowed discriminator
|
181 |
+
cache_generator_outputs: true # whether to cache generator outputs in the training
|
182 |
+
|
183 |
+
# extra module for additional inputs
|
184 |
+
#pitch_extract: dio # pitch extractor type
|
185 |
+
#pitch_extract_conf:
|
186 |
+
# reduction_factor: 1
|
187 |
+
#pitch_normalize: global_mvn # normalizer for the pitch feature
|
188 |
+
#energy_extract: energy # energy extractor type
|
189 |
+
#energy_extract_conf:
|
190 |
+
# reduction_factor: 1
|
191 |
+
#energy_normalize: global_mvn # normalizer for the energy feature
|
192 |
+
|
193 |
+
# initialization (might need to modify for your own pretrained model)
|
194 |
+
init_param:
|
195 |
+
- exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
|
196 |
+
- exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
|
197 |
+
- exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator
|
198 |
+
|
199 |
+
##########################################################
|
200 |
+
# OPTIMIZER & SCHEDULER SETTING #
|
201 |
+
##########################################################
|
202 |
+
# optimizer setting for generator
|
203 |
+
optim: adam
|
204 |
+
optim_conf:
|
205 |
+
lr: 1.25e-5
|
206 |
+
betas: [0.5, 0.9]
|
207 |
+
weight_decay: 0.0
|
208 |
+
scheduler: exponentiallr
|
209 |
+
scheduler_conf:
|
210 |
+
gamma: 0.999875
|
211 |
+
# optimizer setting for discriminator
|
212 |
+
optim2: adam
|
213 |
+
optim2_conf:
|
214 |
+
lr: 1.25e-5
|
215 |
+
betas: [0.5, 0.9]
|
216 |
+
weight_decay: 0.0
|
217 |
+
scheduler2: exponentiallr
|
218 |
+
scheduler2_conf:
|
219 |
+
gamma: 0.999875
|
220 |
+
generator_first: true # whether to start updating generator first
|
221 |
+
|
222 |
+
##########################################################
|
223 |
+
# OTHER TRAINING SETTING #
|
224 |
+
##########################################################
|
225 |
+
#num_iters_per_epoch: 1000 # number of iterations per epoch
|
226 |
+
max_epoch: 500 # number of epochs
|
227 |
+
accum_grad: 1 # gradient accumulation
|
228 |
+
batch_bins: 1600000 # batch bins (feats_type=raw)
|
229 |
+
batch_type: numel # how to make batch
|
230 |
+
grad_clip: -1 # gradient clipping norm
|
231 |
+
grad_noise: false # whether to use gradient noise injection
|
232 |
+
sort_in_batch: descending # how to sort data in making batch
|
233 |
+
sort_batch: descending # how to sort created batches
|
234 |
+
num_workers: 4 # number of workers of data loader
|
235 |
+
use_amp: false # whether to use pytorch amp
|
236 |
+
log_interval: 50 # log interval in iterations
|
237 |
+
keep_nbest_models: 5 # number of models to keep
|
238 |
+
num_att_plot: 3 # number of attention figures to be saved in every check
|
239 |
+
seed: 777 # random seed number
|
240 |
+
patience: null # patience for early stopping
|
241 |
+
unused_parameters: true # needed for multi gpu case
|
242 |
+
best_model_criterion: # criterion to save the best models
|
243 |
+
- - valid
|
244 |
+
- text2mel_loss
|
245 |
+
- min
|
246 |
+
- - train
|
247 |
+
- text2mel_loss
|
248 |
+
- min
|
249 |
+
- - train
|
250 |
+
- total_count
|
251 |
+
- max
|
252 |
+
cudnn_deterministic: false # setting to false accelerates the training speed but makes it non-deterministic
|
253 |
+
# in the case of GAN-TTS training, we strongly recommend setting to false
|
254 |
+
cudnn_benchmark: false # setting to true might accelerate the training speed but sometimes decrease it
|
255 |
+
# therefore, we set to false as a default (recommend trying both cases)
|
tts_example.ipynb
CHANGED
@@ -11,58 +11,14 @@
|
|
11 |
},
|
12 |
{
|
13 |
"cell_type": "code",
|
14 |
-
"execution_count":
|
15 |
"metadata": {},
|
16 |
-
"outputs": [
|
17 |
-
{
|
18 |
-
"name": "stdout",
|
19 |
-
"output_type": "stream",
|
20 |
-
"text": [
|
21 |
-
"downloading uk/mykyta/vits-tts\n",
|
22 |
-
"Found ./model.pth. Skipping download...\n",
|
23 |
-
"Found ./config.yaml. Skipping download...\n"
|
24 |
-
]
|
25 |
-
},
|
26 |
-
{
|
27 |
-
"name": "stderr",
|
28 |
-
"output_type": "stream",
|
29 |
-
"text": [
|
30 |
-
"/Users/robinhad/Projects/ukrainian-tts/.venv/lib/python3.9/site-packages/espnet2/gan_tts/vits/monotonic_align/__init__.py:19: UserWarning: Cython version is not available. Fallback to 'EXPERIMETAL' numba version. If you want to use the cython version, please build it as follows: `cd espnet2/gan_tts/vits/monotonic_align; python setup.py build_ext --inplace`\n",
|
31 |
-
" warnings.warn(\n"
|
32 |
-
]
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"name": "stdout",
|
36 |
-
"output_type": "stream",
|
37 |
-
"text": [
|
38 |
-
"RTF = 0.213155\n",
|
39 |
-
"Accented text: прив+іт, +як +у теб+е спр+ави?\n"
|
40 |
-
]
|
41 |
-
},
|
42 |
-
{
|
43 |
-
"data": {
|
44 |
-
"text/html": [
|
45 |
-
"\n",
|
46 |
-
" <audio controls=\"controls\" >\n",
|
47 |
-
" <source src=\"data:audio/x-wav;base64,\" type=\"audio/x-wav\" />\n",
|
48 |
-
" Your browser does not support the audio element.\n",
|
49 |
-
" </audio>\n",
|
50 |
-
" "
|
51 |
-
],
|
52 |
-
"text/plain": [
|
53 |
-
"<IPython.lib.display.Audio object>"
|
54 |
-
]
|
55 |
-
},
|
56 |
-
"execution_count": 1,
|
57 |
-
"metadata": {},
|
58 |
-
"output_type": "execute_result"
|
59 |
-
}
|
60 |
-
],
|
61 |
"source": [
|
62 |
"from ukrainian_tts.tts import TTS, Voices, Stress\n",
|
63 |
"import IPython.display as ipd\n",
|
64 |
"\n",
|
65 |
-
"tts = TTS(device=\"cpu\") # can try gpu, mps\n",
|
66 |
"with open(\"test.wav\", mode=\"wb\") as file:\n",
|
67 |
" _, output_text = tts.tts(\"Привіт, як у тебе справи?\", Voices.Dmytro.value, Stress.Dictionary.value, file)\n",
|
68 |
"print(\"Accented text:\", output_text)\n",
|
@@ -72,24 +28,9 @@
|
|
72 |
},
|
73 |
{
|
74 |
"cell_type": "code",
|
75 |
-
"execution_count":
|
76 |
"metadata": {},
|
77 |
-
"outputs": [
|
78 |
-
{
|
79 |
-
"data": {
|
80 |
-
"text/plain": [
|
81 |
-
"[<Voices.Olena: 4>,\n",
|
82 |
-
" <Voices.Mykyta: 3>,\n",
|
83 |
-
" <Voices.Lada: 2>,\n",
|
84 |
-
" <Voices.Dmytro: 1>,\n",
|
85 |
-
" <Voices.Olga: 5>]"
|
86 |
-
]
|
87 |
-
},
|
88 |
-
"execution_count": 2,
|
89 |
-
"metadata": {},
|
90 |
-
"output_type": "execute_result"
|
91 |
-
}
|
92 |
-
],
|
93 |
"source": [
|
94 |
"[voice for voice in Voices]"
|
95 |
]
|
@@ -111,7 +52,7 @@
|
|
111 |
"name": "python",
|
112 |
"nbconvert_exporter": "python",
|
113 |
"pygments_lexer": "ipython3",
|
114 |
-
"version": "3.
|
115 |
},
|
116 |
"orig_nbformat": 4,
|
117 |
"vscode": {
|
|
|
11 |
},
|
12 |
{
|
13 |
"cell_type": "code",
|
14 |
+
"execution_count": null,
|
15 |
"metadata": {},
|
16 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
"source": [
|
18 |
"from ukrainian_tts.tts import TTS, Voices, Stress\n",
|
19 |
"import IPython.display as ipd\n",
|
20 |
"\n",
|
21 |
+
"tts = TTS(device=\"cpu\", cache_dir=\"model\") # can try gpu, mps\n",
|
22 |
"with open(\"test.wav\", mode=\"wb\") as file:\n",
|
23 |
" _, output_text = tts.tts(\"Привіт, як у тебе справи?\", Voices.Dmytro.value, Stress.Dictionary.value, file)\n",
|
24 |
"print(\"Accented text:\", output_text)\n",
|
|
|
28 |
},
|
29 |
{
|
30 |
"cell_type": "code",
|
31 |
+
"execution_count": null,
|
32 |
"metadata": {},
|
33 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
"source": [
|
35 |
"[voice for voice in Voices]"
|
36 |
]
|
|
|
52 |
"name": "python",
|
53 |
"nbconvert_exporter": "python",
|
54 |
"pygments_lexer": "ipython3",
|
55 |
+
"version": "3.10.12"
|
56 |
},
|
57 |
"orig_nbformat": 4,
|
58 |
"vscode": {
|