yuancwang committed
Commit 9893813 · 1 Parent(s): b725c5a

add app

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete changeset.
- .gitignore +1 -0
- app.py +110 -9
- egs/datasets/README.md +381 -0
- egs/metrics/README.md +94 -0
- egs/metrics/run.sh +42 -0
- egs/svc/DiffComoSVC/README.md +234 -0
- egs/svc/DiffComoSVC/exp_config.json +143 -0
- egs/svc/DiffComoSVC/run.sh +1 -0
- egs/svc/MultipleContentsSVC/README.md +153 -0
- egs/svc/MultipleContentsSVC/exp_config.json +126 -0
- egs/svc/MultipleContentsSVC/run.sh +1 -0
- egs/svc/README.md +34 -0
- egs/svc/TransformerSVC/README.md +164 -0
- egs/svc/TransformerSVC/exp_config.json +108 -0
- egs/svc/TransformerSVC/run.sh +1 -0
- egs/svc/VitsSVC/README.md +125 -0
- egs/svc/VitsSVC/exp_config.json +162 -0
- egs/svc/VitsSVC/run.sh +1 -0
- egs/svc/_template/run.sh +150 -0
- egs/tta/README.md +19 -0
- egs/tta/RECIPE.md +156 -0
- egs/tta/audioldm/exp_config.json +90 -0
- egs/tta/audioldm/exp_config_base.json +11 -0
- egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
- egs/tta/audioldm/run_inference.sh +52 -0
- egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
- egs/tta/audioldm/run_train.sh +26 -0
- egs/tta/audioldm/run_train_latent_4_10_78.sh +26 -0
- egs/tta/autoencoderkl/exp_config.json +49 -0
- egs/tta/autoencoderkl/exp_config_base.json +11 -0
- egs/tta/autoencoderkl/exp_config_latent_4_10_78.json +59 -0
- egs/tta/autoencoderkl/run_train.sh +26 -0
- egs/tta/autoencoderkl/run_train_latent_4_10_78.sh +26 -0
- egs/tts/FastSpeech2/README.md +132 -0
- egs/tts/FastSpeech2/exp_config.json +21 -0
- egs/tts/FastSpeech2/prepare_mfa.sh +14 -0
- egs/tts/FastSpeech2/run.sh +150 -0
- egs/tts/NaturalSpeech2/exp_config.json +39 -0
- egs/tts/NaturalSpeech2/exp_config_base.json +118 -0
- egs/tts/NaturalSpeech2/run_inference.sh +43 -0
- egs/tts/NaturalSpeech2/run_train.sh +18 -0
- egs/tts/README.md +17 -0
- egs/tts/VALLE/README.md +139 -0
- egs/tts/VALLE/exp_config.json +33 -0
- egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt +1 -0
- egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt +1 -0
- egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt +1 -0
- egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt +1 -0
- egs/tts/VALLE/run.sh +158 -0
- egs/tts/VITS/README.md +135 -0
.gitignore
CHANGED
@@ -35,6 +35,7 @@ egs/svc/dev_exp_config.json
 bins/svc/demo*
 bins/svc/preprocess_custom.py
 data
+ckpts
 
 # Data and ckpt
 *.pkl
app.py
CHANGED
@@ -1,24 +1,125 @@
import gradio as gr
import argparse
import os
import torch
import soundfile as sf
import numpy as np

from models.tts.naturalspeech2.ns2 import NaturalSpeech2
from encodec import EncodecModel
from encodec.utils import convert_audio
from utils.util import load_config

from text import text_to_sequence
from text.cmudict import valid_symbols
from text.g2p import preprocess_english, read_lexicon

import torchaudio


def build_codec(device):
    encodec_model = EncodecModel.encodec_model_24khz()
    encodec_model = encodec_model.to(device=device)
    encodec_model.set_target_bandwidth(12.0)
    return encodec_model


def build_model(cfg, device):
    model = NaturalSpeech2(cfg.model)
    model.load_state_dict(
        torch.load(
            "ckpts/ns2/pytorch_model.bin",
            map_location="cpu",
        )
    )
    model = model.to(device=device)
    return model


def ns2_inference(
    prmopt_audio_path,
    text,
    diffusion_steps=100,
):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    os.environ["WORK_DIR"] = "./"
    cfg = load_config("egs/tts/NaturalSpeech2/exp_config.json")

    model = build_model(cfg, device)
    codec = build_codec(device)

    ref_wav_path = prmopt_audio_path
    ref_wav, sr = torchaudio.load(ref_wav_path)
    ref_wav = convert_audio(
        ref_wav, sr, codec.sample_rate, codec.channels
    )
    ref_wav = ref_wav.unsqueeze(0).to(device=device)

    with torch.no_grad():
        encoded_frames = codec.encode(ref_wav)
        ref_code = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)

    ref_mask = torch.ones(ref_code.shape[0], ref_code.shape[-1]).to(ref_code.device)

    symbols = valid_symbols + ["sp", "spn", "sil"] + ["<s>", "</s>"]
    phone2id = {s: i for i, s in enumerate(symbols)}
    id2phone = {i: s for s, i in phone2id.items()}

    lexicon = read_lexicon(cfg.preprocess.lexicon_path)
    phone_seq = preprocess_english(text, lexicon)

    phone_id = np.array(
        [
            *map(
                phone2id.get,
                phone_seq.replace("{", "").replace("}", "").split(),
            )
        ]
    )
    phone_id = torch.from_numpy(phone_id).unsqueeze(0).to(device=device)

    x0, prior_out = model.inference(
        ref_code, phone_id, ref_mask, diffusion_steps
    )

    latent_ref = codec.quantizer.vq.decode(ref_code.transpose(0, 1))
    rec_wav = codec.decoder(x0)

    os.makedirs("result", exist_ok=True)
    sf.write(
        "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result"),
        rec_wav[0, 0].detach().cpu().numpy(),
        samplerate=24000,
    )

    result_file = "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result")
    return result_file


demo_inputs = [
    gr.Audio(
        sources=["upload", "microphone"],
        label="Upload a reference speech you want to clone timbre",
        type="filepath",
    ),
    gr.Textbox(
        value="Amphion is a toolkit that can speak, make sounds, and sing.",
        label="Text you want to generate",
        type="text",
    ),
    gr.Slider(
        10,
        1000,
        value=200,
        step=1,
        label="Diffusion Inference Steps",
        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
    ),
]
demo_outputs = gr.Audio(label="")

demo = gr.Interface(
    fn=ns2_inference,
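The hunk shown above is cut off inside the `gr.Interface(...)` call, so the rest of the new `app.py` is not visible in this view. Below is a minimal sketch of how such an interface is typically finished and launched; the `inputs`/`outputs`/`title` arguments and the `demo.launch()` call are assumptions following the public Gradio API, not content confirmed by this diff.

```python
# Hypothetical continuation of the truncated call above; the exact arguments used
# in the commit are unknown, and the title string is a placeholder.
demo = gr.Interface(
    fn=ns2_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
    title="Amphion NaturalSpeech2 Zero-Shot TTS",
)

if __name__ == "__main__":
    demo.launch()
```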
egs/datasets/README.md
ADDED
@@ -0,0 +1,381 @@
# Datasets Format

Amphion supports the following academic datasets (sorted alphabetically):

- [Datasets Format](#datasets-format)
  - [AudioCaps](#audiocaps)
  - [CSD](#csd)
  - [KiSing](#kising)
  - [LibriTTS](#libritts)
  - [LJSpeech](#ljspeech)
  - [M4Singer](#m4singer)
  - [NUS-48E](#nus-48e)
  - [Opencpop](#opencpop)
  - [OpenSinger](#opensinger)
  - [Opera](#opera)
  - [PopBuTFy](#popbutfy)
  - [PopCS](#popcs)
  - [PJS](#pjs)
  - [SVCC](#svcc)
  - [VCTK](#vctk)

The download link and the file structure tree of each dataset are displayed as follows.

## AudioCaps

AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:

```plaintext
[AudioCaps dataset path]
┣ AudioCpas
┃ ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
```

## CSD

The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:

```plaintext
[CSD dataset path]
┣ english
┣ korean
┣ utterances
┃ ┣ en001a
┃ ┃ ┣ {UtteranceID}.wav
┃ ┣ en001b
┃ ┣ en002a
┃ ┣ en002b
┃ ┣ ...
┣ README
```

## KiSing

The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:

```plaintext
[KiSing dataset path]
┣ clean
┃ ┣ 421
┃ ┣ 422
┃ ┣ ...
```

## LibriTTS

The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:

```plaintext
[LibriTTS dataset path]
┣ BOOKS.txt
┣ CHAPTERS.txt
┣ eval_sentences10.tsv
┣ LICENSE.txt
┣ NOTE.txt
┣ reader_book.tsv
┣ README_librispeech.txt
┣ README_libritts.txt
┣ speakers.tsv
┣ SPEAKERS.txt
┣ dev-clean (Subset)
┃ ┣ 1272{Speaker_ID}
┃ ┃ ┣ 128104 {Chapter_ID}
┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ 1272_128104.book.tsv
┃ ┃ ┃ ┣ 1272_128104.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ dev-other (Subset)
┃ ┣ 116 (Speaker)
┃ ┃ ┣ 288045 {Chapter_ID}
┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ 116_288045.book.tsv
┃ ┃ ┃ ┣ 116_288045.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┃ ┣ ...
┣ test-clean (Subset)
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ test-other
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-clean-100
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-clean-360
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-other-500
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
```

## LJSpeech

The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:

```plaintext
[LJSpeech dataset path]
┣ metadata.csv
┣ wavs
┃ ┣ LJ001-0001.wav
┃ ┣ LJ001-0002.wav
┃ ┣ ...
┣ README
```

## M4Singer

The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:

```plaintext
[M4Singer dataset path]
┣ {Singer_1}#{Song_1}
┃ ┣ 0000.mid
┃ ┣ 0000.TextGrid
┃ ┣ 0000.wav
┃ ┣ ...
┣ {Singer_1}#{Song_2}
┣ ...
┣ {Singer_2}#{Song_1}
┣ {Singer_2}#{Song_2}
┣ ...
┗ meta.json
```

## NUS-48E

The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:

```plaintext
[NUS-48E dataset path]
┣ {SpeakerID}
┃ ┣ read
┃ ┃ ┣ {SongID}.txt
┃ ┃ ┣ {SongID}.wav
┃ ┃ ┣ ...
┃ ┣ sing
┃ ┃ ┣ {SongID}.txt
┃ ┃ ┣ {SongID}.wav
┃ ┃ ┣ ...
┣ ...
┣ README.txt
```

## Opencpop

The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:

```plaintext
[Opencpop dataset path]
┣ midis
┃ ┣ 2001.midi
┃ ┣ 2002.midi
┃ ┣ 2003.midi
┃ ┣ ...
┣ segments
┃ ┣ wavs
┃ ┃ ┣ 2001000001.wav
┃ ┃ ┣ 2001000002.wav
┃ ┃ ┣ 2001000003.wav
┃ ┃ ┣ ...
┃ ┣ test.txt
┃ ┣ train.txt
┃ ┗ transcriptions.txt
┣ textgrids
┃ ┣ 2001.TextGrid
┃ ┣ 2002.TextGrid
┃ ┣ 2003.TextGrid
┃ ┣ ...
┣ wavs
┃ ┣ 2001.wav
┃ ┣ 2002.wav
┃ ┣ 2003.wav
┃ ┣ ...
┣ TERMS_OF_ACCESS
┗ readme.md
```

## OpenSinger

The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:

```plaintext
[OpenSinger dataset path]
┣ ManRaw
┃ ┣ {Singer_1}_{Song_1}
┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
┃ ┃ ┣ ...
┃ ┣ {Singer_1}_{Song_2}
┃ ┣ ...
┣ WomanRaw
┣ LICENSE
┗ README.md
```

## Opera

The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:

```plaintext
[Opera dataset path]
┣ monophonic
┃ ┣ chinese
┃ ┃ ┣ {Gender}_{SingerID}
┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┣ ...
┃ ┣ western
┣ polyphonic
┃ ┣ chinese
┃ ┣ western
┣ CrossculturalDataSet.xlsx
```

## PopBuTFy

The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:

```plaintext
[PopBuTFy dataset path]
┣ data
┃ ┣ {SingerID}#singing#{SongName}_Amateur
┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
┃ ┃ ┣ ...
┃ ┣ {SingerID}#singing#{SongName}_Professional
┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
┃ ┃ ┣ ...
┣ text_labels
┗ TERMS_OF_ACCESS
```

## PopCS

The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:

```plaintext
[PopCS dataset path]
┣ popcs
┃ ┣ popcs-{SongName}
┃ ┃ ┣ {UtteranceID}_ph.txt
┃ ┃ ┣ {UtteranceID}_wf0.wav
┃ ┃ ┣ {UtteranceID}.TextGrid
┃ ┃ ┣ {UtteranceID}.txt
┃ ┃ ┣ ...
┃ ┣ ...
┗ TERMS_OF_ACCESS
```

## PJS

The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:

```plaintext
[PJS dataset path]
┣ PJS_corpus_ver1.1
┃ ┣ background_noise
┃ ┣ pjs{SongID}
┃ ┃ ┣ pjs{SongID}_song.wav
┃ ┃ ┣ pjs{SongID}_speech.wav
┃ ┃ ┣ pjs{SongID}.lab
┃ ┃ ┣ pjs{SongID}.mid
┃ ┃ ┣ pjs{SongID}.musicxml
┃ ┃ ┣ pjs{SongID}.txt
┃ ┣ ...
```

## SVCC

The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:

```plaintext
[SVCC dataset path]
┣ Data
┃ ┣ CDF1
┃ ┃ ┣ 10001.wav
┃ ┃ ┣ 10002.wav
┃ ┃ ┣ ...
┃ ┣ CDM1
┃ ┣ IDF1
┃ ┣ IDM1
┗ README.md
```

## VCTK

The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:

```plaintext
[VCTK dataset path]
┣ txt
┃ ┣ {Speaker_1}
┃ ┃ ┣ {Speaker_1}_001.txt
┃ ┃ ┣ {Speaker_1}_002.txt
┃ ┃ ┣ ...
┃ ┣ {Speaker_2}
┃ ┣ ...
┣ wav48_silence_trimmed
┃ ┣ {Speaker_1}
┃ ┃ ┣ {Speaker_1}_001_mic1.flac
┃ ┃ ┣ {Speaker_1}_001_mic2.flac
┃ ┃ ┣ {Speaker_1}_002_mic1.flac
┃ ┃ ┣ ...
┃ ┣ {Speaker_2}
┃ ┣ ...
┣ speaker-info.txt
┗ update.txt
```
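Since every recipe expects the exact layouts above, a quick structural check can save a failed preprocessing run. Here is a minimal sketch for the LJSpeech layout, using only the Python standard library; the path argument is a placeholder you supply, not an Amphion configuration value.

```python
from pathlib import Path

def check_ljspeech(root: str) -> None:
    """Verify the LJSpeech layout described above: metadata.csv plus a wavs/ folder."""
    base = Path(root)
    assert (base / "metadata.csv").is_file(), "metadata.csv is missing"
    wavs = sorted((base / "wavs").glob("LJ*.wav"))
    assert wavs, "no LJ*.wav files found under wavs/"
    print(f"OK: {len(wavs)} wav files, first is {wavs[0].name}")

# check_ljspeech("[LJSpeech dataset path]")  # replace with your local path
```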
egs/metrics/README.md
ADDED
@@ -0,0 +1,94 @@
# Amphion Evaluation Recipe

## Supported Evaluation Metrics

Until now, Amphion Evaluation has supported the following objective metrics:

- **F0 Modeling**:
  - F0 Pearson Coefficients (FPC)
  - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
  - F0 Root Mean Square Error (F0RMSE)
  - Voiced/Unvoiced F1 Score (V/UV F1)
- **Energy Modeling**:
  - Energy Root Mean Square Error (EnergyRMSE)
  - Energy Pearson Coefficients (EnergyPC)
- **Intelligibility**:
  - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
  - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
- **Spectrogram Distortion**:
  - Frechet Audio Distance (FAD)
  - Mel Cepstral Distortion (MCD)
  - Multi-Resolution STFT Distance (MSTFT)
  - Perceptual Evaluation of Speech Quality (PESQ)
  - Short Time Objective Intelligibility (STOI)
  - Scale Invariant Signal to Distortion Ratio (SISDR)
  - Scale Invariant Signal to Noise Ratio (SISNR)
- **Speaker Similarity**:
  - Cosine similarity based on [RawNet3](https://github.com/Jungjee/RawNet)
  - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 developing)

We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:

1. Pretrained Models Preparation
2. Audio Data Preparation
3. Evaluation

## 1. Pretrained Models Preparation

If you want to calculate the `RawNet3`-based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).

## 2. Audio Data Preparation

Prepare the reference audios and the generated audios in two folders: `ref_dir` contains the reference audios and `gen_dir` contains the generated audios. Here is an example.

```plaintext
 ┣ {ref_dir}
 ┃ ┣ sample1.wav
 ┃ ┣ sample2.wav
 ┣ {gen_dir}
 ┃ ┣ sample1.wav
 ┃ ┣ sample2.wav
```

You have to make sure that each pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).

## 3. Evaluation

Run `run.sh` with the specified reference folder, generated folder, dump folder, and metrics.

```bash
cd Amphion
sh egs/metrics/run.sh \
    --reference_folder [Your path to the reference audios] \
    --generated_folder [Your path to the generated audios] \
    --dump_folder [Your path to dump the objective results] \
    --metrics [The metrics you need] \
    --fs [Optional. To calculate all metrics in the specified sampling rate]
```

As for the metrics, an example is provided below:

```bash
--metrics "mcd pesq fad"
```

All currently available metric keywords are listed below:

| Keys                  | Description                                |
| --------------------- | ------------------------------------------ |
| `fpc`                 | F0 Pearson Coefficients                    |
| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error      |
| `f0rmse`              | F0 Root Mean Square Error                  |
| `v_uv_f1`             | Voiced/Unvoiced F1 Score                   |
| `energy_rmse`         | Energy Root Mean Square Error              |
| `energy_pc`           | Energy Pearson Coefficients                |
| `cer`                 | Character Error Rate                       |
| `wer`                 | Word Error Rate                            |
| `speaker_similarity`  | Cosine Similarity based on RawNet3         |
| `fad`                 | Frechet Audio Distance                     |
| `mcd`                 | Mel Cepstral Distortion                    |
| `mstft`               | Multi-Resolution STFT Distance             |
| `pesq`                | Perceptual Evaluation of Speech Quality    |
| `si_sdr`              | Scale Invariant Signal to Distortion Ratio |
| `si_snr`              | Scale Invariant Signal to Noise Ratio      |
| `stoi`                | Short Time Objective Intelligibility       |
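Because the metrics are computed pairwise by file name, a mismatch between `ref_dir` and `gen_dir` will break the pairing. Here is a small sketch that checks the naming rule above before you run `run.sh`; the directory paths are placeholders.

```python
from pathlib import Path

def check_pairing(ref_dir: str, gen_dir: str) -> None:
    """Report generated files that have no identically named reference file."""
    ref_names = {p.name for p in Path(ref_dir).glob("*.wav")}
    unpaired = [p.name for p in Path(gen_dir).glob("*.wav") if p.name not in ref_names]
    print("Unpaired generated files:", unpaired or "none")

# check_pairing("[Your path to the reference audios]", "[Your path to the generated audios]")
```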
egs/metrics/run.sh
ADDED
@@ -0,0 +1,42 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $exp_dir))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Reference Audio Folder
    --reference_folder) shift; ref_dir=$1 ; shift ;;
    # Generated Audio Folder
    --generated_folder) shift; deg_dir=$1 ; shift ;;
    # Result Dumping Folder
    --dump_folder) shift; dump_dir=$1 ; shift ;;
    # Metrics to Compute
    --metrics) shift; metrics=$1 ; shift ;;
    # Sampling Rate
    --fs) shift; fs=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done

######## Calculate Objective Metrics ###########
CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
    --ref_dir $ref_dir \
    --deg_dir $deg_dir \
    --dump_dir $dump_dir \
    --metrics $metrics \
    --fs $fs \
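The script above is a thin wrapper around `bins/calc_metrics.py`; the flags it forwards are visible in its last command. If you prefer to call the underlying script from Python, a sketch using those same flags is below. The paths and values are placeholders, and it assumes you run it from the Amphion root with the same environment variables the shell script exports.

```python
import os
import subprocess

# Mirror the environment set up by run.sh (WORK_DIR/PYTHONPATH point at the Amphion root).
env = dict(os.environ, WORK_DIR=".", PYTHONPATH=".", PYTHONIOENCODING="UTF-8",
           CUDA_VISIBLE_DEVICES="0")

subprocess.run(
    [
        "python", "bins/calc_metrics.py",
        "--ref_dir", "[Your path to the reference audios]",
        "--deg_dir", "[Your path to the generated audios]",
        "--dump_dir", "[Your path to dump the objective results]",
        "--metrics", "mcd",
        "--fs", "24000",
    ],
    env=env,
    check=True,
)
```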
egs/svc/DiffComoSVC/README.md
ADDED
@@ -0,0 +1,234 @@
# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation

<br>
<div align="center">
<img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
</div>
<br>

This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio); only a slight modification is applied to the acoustic model. Specifically,

* The acoustic model is a conformer which generates a coarse spectrogram, followed by a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN which polishes the coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
* To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.

There are five stages in total:

1. Data preparation
2. Features extraction
3. Teacher model training
4. Consistency distillation
5. Inference/conversion

## 1. Data Preparation

### Dataset Download

By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the trained model in `exp_config.json`:

```json
    // TODO: Fill in the output log path
    "log_dir": "[Your path to save logs and checkpoints]",
    "preprocess": {
        // TODO: Fill in the output data path
        "processed_dir": "[Your path to save processed data]",
        ...
    },
```

### Run

Run `run.sh` as the preprocess stage (set `--stage 1`).

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 1
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Teacher Model Training

### Configuration

Set `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:

```JSON
"comosvc":{
    "distill": false,
    // conformer encoder
    "input_dim": 384,
    "output_dim": 100,
    "n_heads": 2,
    "n_layers": 6,
    "filter_channels":512,
    // karras diffusion
    "P_mean": -1.2,
    "P_std": 1.2,
    "sigma_data": 0.5,
    "sigma_min": 0.002,
    "sigma_max": 80,
    "rho": 7,
    "n_timesteps": 40,
},
```

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh`, for example:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
```

## 4. Consistency Distillation

### Configuration

Set `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path`. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:

```JSON
"model": {
    "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
    ...
    "comosvc":{
        "distill": true,
        // conformer encoder
        "input_dim": 384,
        "output_dim": 100,
        "n_heads": 2,
        "n_layers": 6,
        "filter_channels":512,
        // karras diffusion
        "P_mean": -1.2,
        "P_std": 1.2,
        "sigma_data": 0.5,
        "sigma_min": 0.002,
        "sigma_max": 80,
        "rho": 7,
        "n_timesteps": 40,
    },
```

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh`, for example:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
```

## 5. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
    --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```

Specially, you can configure the inference steps for the teacher model by setting `inference` in `exp_config` (the student model always uses one-step sampling):

```json
"inference": {
    "comosvc": {
        "inference_steps": 40
    }
}
```

# Reference

https://github.com/zhenye234/CoMoSpeech

https://github.com/openai/consistency_models
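For intuition about what the `sigma_min`, `sigma_max`, `rho`, and `n_timesteps` values in the teacher configuration control, here is a minimal sketch of the Karras et al. (2022) noise schedule that these names conventionally refer to. It is illustrative only and not Amphion's actual implementation.

```python
import numpy as np

def karras_sigmas(sigma_min=0.002, sigma_max=80.0, rho=7.0, n_timesteps=40):
    """Noise levels sigma_i = (max^(1/rho) + i/(N-1) * (min^(1/rho) - max^(1/rho)))^rho."""
    i = np.arange(n_timesteps)
    max_inv_rho = sigma_max ** (1.0 / rho)
    min_inv_rho = sigma_min ** (1.0 / rho)
    return (max_inv_rho + i / (n_timesteps - 1) * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()
print(sigmas[0], sigmas[-1])  # starts at sigma_max (80.0) and decays to sigma_min (0.002)
```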
egs/svc/DiffComoSVC/exp_config.json
ADDED
@@ -0,0 +1,143 @@
{
    "base_config": "config/comosvc.json",
    "model_type": "DiffComoSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path
    "log_dir": "[Your path to save logs and checkpoints]",
    "preprocess": {
        // TODO: Fill in the output data path
        "processed_dir": "[Your path to save processed data]",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "teacher_model_path": "[Your_teacher_model_checkpoint].bin",
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "comosvc": {
            "distill": false,
            // conformer encoder
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
            // karras diffusion
            "P_mean": -1.2,
            "P_std": 1.2,
            "sigma_data": 0.5,
            "sigma_min": 0.002,
            "sigma_max": 80,
            "rho": 7,
            "n_timesteps": 40,
        },
        "diffusion": {
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv",
            // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 384,
                "n_res_block": 20,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                // specially, 1 means no dilation
                "conditioner_size": 100
            }
        }
    },
    "train": {
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            50,
            50
        ],
        "keep_last": [
            5,
            -1
        ],
        "run_eval": [
            false,
            true
        ],
        "adamw": {
            "lr": 4.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    },
    "inference": {
        "comosvc": {
            "inference_steps": 40
        }
    }
}
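Note that this config (like the other `exp_config.json` files in this commit) contains `//` comments and trailing commas, so Python's standard `json` module will not parse it directly; inside Amphion it is read through `load_config`, as seen in `app.py` above. If you want to inspect it on its own, a lenient parser such as the third-party `json5` package can be used — a sketch, assuming `json5` is installed and you run from the Amphion root:

```python
import json5  # pip install json5 — tolerates // comments and trailing commas

with open("egs/svc/DiffComoSVC/exp_config.json") as f:
    cfg = json5.load(f)

print(cfg["model"]["comosvc"]["n_timesteps"])  # 40
```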
egs/svc/DiffComoSVC/run.sh
ADDED
@@ -0,0 +1 @@
../_template/run.sh
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,153 @@
# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion

[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)

<br>
<div align="center">
<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
</div>
<br>

This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,

- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
- The acoustic model is based on a Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
- The vocoder is a [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the trained model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocess stage (set `--stage 1`).

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 1
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```

## Citations

```bibtex
@article{zhang2023leveraging,
  title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
  author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
  journal={Machine Learning for Audio Workshop, NeurIPS 2023},
  year={2023}
}
```
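As a rough illustration of the paper's core idea — fusing several content features into one conditioning sequence — here is a toy sketch that reuses the `whisper_dim` and `contentvec_dim` values from `exp_config.json` and a 384-dimensional output matching the decoder's `conditioner_size`. It is a schematic stand-in, not Amphion's actual `DiffWaveNetSVC` condition encoder.

```python
import torch
import torch.nn as nn

class ToyConditionEncoder(nn.Module):
    """Project each content feature to a shared size and sum them into one condition."""

    def __init__(self, whisper_dim=1024, contentvec_dim=256, out_dim=384):
        super().__init__()
        self.whisper_proj = nn.Linear(whisper_dim, out_dim)
        self.contentvec_proj = nn.Linear(contentvec_dim, out_dim)

    def forward(self, whisper_feat, contentvec_feat):
        return self.whisper_proj(whisper_feat) + self.contentvec_proj(contentvec_feat)

enc = ToyConditionEncoder()
cond = enc(torch.randn(1, 100, 1024), torch.randn(1, 100, 256))
print(cond.shape)  # torch.Size([1, 100, 384])
```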
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
    "base_config": "config/diffusion.json",
    "model_type": "DiffWaveNetSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "diffusion": {
            "scheduler": "ddpm",
            "scheduler_settings": {
                "num_train_timesteps": 1000,
                "beta_start": 1.0e-4,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            },
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv",
            // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 512,
                "n_res_block": 40,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                // specially, 1 means no dilation
                "conditioner_size": 384
            }
        }
    },
    "train": {
        "batch_size": 32,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            3,
            50
        ],
        "keep_last": [
            3,
            2
        ],
        "run_eval": [
            true,
            true
        ],
        "adamw": {
            "lr": 2.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 30,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    }
}
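The `scheduler_settings` in the config above follow the standard DDPM parameterization. As a rough, non-Amphion sketch of what `num_train_timesteps`, `beta_start`, `beta_end`, and the `linear` schedule control, the noise schedule and the forward-diffusion step can be written directly in PyTorch:

```python
import torch

def linear_beta_schedule(num_train_timesteps=1000, beta_start=1.0e-4, beta_end=0.02):
    """Linear DDPM noise schedule matching the scheduler_settings above (sketch only)."""
    betas = torch.linspace(beta_start, beta_end, num_train_timesteps)
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
    return betas, alphas_cumprod

betas, alphas_cumprod = linear_beta_schedule()

# Forward process q(x_t | x_0) = N(sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I)
t = 500
x0 = torch.randn(1, 100, 256)              # e.g. a 100-bin mel-spectrogram segment
noise = torch.randn_like(x0)
x_t = alphas_cumprod[t].sqrt() * x0 + (1 - alphas_cumprod[t]).sqrt() * noise
```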
egs/svc/MultipleContentsSVC/run.sh
ADDED
@@ -0,0 +1 @@
../_template/run.sh
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
# Amphion Singing Voice Conversion (SVC) Recipe

## Quick Start

We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).

## Supported Model Architectures

The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):

<br>
<div align="center">
  <img src="../../imgs/svc/pipeline.png" width="70%">
</div>
<br>

Until now, Amphion SVC has supported the following features and models:

- **Speaker-agnostic Representations**:
  - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
  - Prosody Features: F0 and energy.
- **Speaker Embeddings**:
  - Speaker Look-Up Table.
  - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
- **Acoustic Decoders**:
  - Diffusion-based models:
    - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
    - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
  - Transformer-based models:
    - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
  - VAE- and Flow-based models:
    - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **Waveform Synthesizers (Vocoders)**:
  - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
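To make the "disentangle, then inject the speaker" idea above concrete, here is a toy, hedged sketch of the data flow (the class and its layers are placeholders for illustration, not Amphion's actual modules; the dimensions loosely follow the beginner recipe: 1024-dim Whisper features, 256-dim ContentVec features, 100 mel bins):

```python
import torch
import torch.nn as nn

class ToySVCPipeline(nn.Module):
    """Toy SVC flow: speaker-agnostic content/prosody features + target speaker
    embedding -> acoustic decoder -> mel spectrogram (a vocoder then produces audio)."""

    def __init__(self, whisper_dim=1024, contentvec_dim=256, hidden=384,
                 n_mel=100, n_singers=512):
        super().__init__()
        self.content_proj = nn.Linear(whisper_dim + contentvec_dim, hidden)
        self.pitch_proj = nn.Linear(1, hidden)                 # frame-level F0
        self.energy_proj = nn.Linear(1, hidden)                # frame-level energy
        self.speaker_table = nn.Embedding(n_singers, hidden)   # speaker look-up table
        self.decoder = nn.GRU(hidden, hidden, batch_first=True)
        self.mel_head = nn.Linear(hidden, n_mel)

    def forward(self, whisper, contentvec, f0, energy, singer_id):
        cond = self.content_proj(torch.cat([whisper, contentvec], dim=-1))
        cond = cond + self.pitch_proj(f0) + self.energy_proj(energy)
        cond = cond + self.speaker_table(singer_id).unsqueeze(1)  # inject target speaker
        hidden, _ = self.decoder(cond)
        return self.mel_head(hidden)

# Example: ~200 frames of source features converted to speaker id 3
B, T = 1, 200
mel = ToySVCPipeline()(torch.randn(B, T, 1024), torch.randn(B, T, 256),
                       torch.randn(B, T, 1), torch.randn(B, T, 1),
                       torch.tensor([3]))
print(mel.shape)  # torch.Size([1, 200, 100])
```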
egs/svc/TransformerSVC/README.md
ADDED
@@ -0,0 +1,164 @@
# Transformer for Singing Voice Conversion

This is an implementation of a **vanilla transformer encoder**/**conformer** as the acoustic model for singing voice conversion.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/svc/TransformerSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

Specify the detailed configuration for the transformer block in `exp_config.json`. For the key `type`, both `conformer` and `transformer` are supported:

```json
"model": {
    ...
    "transformer": {
        // 'conformer' or 'transformer'
        "type": "conformer",
        "input_dim": 384,
        "output_dim": 100,
        "n_heads": 2,
        "n_layers": 6,
        "filter_channels": 512,
        "dropout": 0.1,
    }
}
```

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| ---------- | ----------- | ------- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
cd Amphion
sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```

## Citations

```bibtex
@inproceedings{transformer,
  author    = {Ashish Vaswani and
               Noam Shazeer and
               Niki Parmar and
               Jakob Uszkoreit and
               Llion Jones and
               Aidan N. Gomez and
               Lukasz Kaiser and
               Illia Polosukhin},
  title     = {Attention is All you Need},
  booktitle = {{NIPS}},
  pages     = {5998--6008},
  year      = {2017}
}
```
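To make the `transformer` settings concrete, here is a hedged sketch (plain PyTorch, not Amphion's `TransformerSVC` class) of an encoder-only, non-autoregressive acoustic model that maps 384-dim condition frames to 100-bin mel frames, reusing `n_heads`, `n_layers`, `filter_channels`, and `dropout` as configured above:

```python
import torch
import torch.nn as nn

class ToyTransformerAcousticModel(nn.Module):
    """Encoder-only, non-autoregressive mapping from condition frames to mel frames."""

    def __init__(self, input_dim=384, output_dim=100, n_heads=2, n_layers=6,
                 filter_channels=512, dropout=0.1):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=n_heads,
            dim_feedforward=filter_channels, dropout=dropout,
            batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.out = nn.Linear(input_dim, output_dim)

    def forward(self, condition):                    # condition: (batch, frames, 384)
        return self.out(self.encoder(condition))     # (batch, frames, 100)

model = ToyTransformerAcousticModel()
mel = model(torch.randn(2, 400, 384))
print(mel.shape)  # torch.Size([2, 400, 100])
```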
egs/svc/TransformerSVC/exp_config.json
ADDED
@@ -0,0 +1,108 @@
{
    "base_config": "config/transformer.json",
    "model_type": "TransformerSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "transformer": {
            // 'conformer' or 'transformer'
            "type": "conformer",
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
        }
    },
    "train": {
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            50,
            50
        ],
        "keep_last": [
            5,
            -1
        ],
        "run_eval": [
            false,
            true
        ],
        "adamw": {
            "lr": 4.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    }
}
egs/svc/TransformerSVC/run.sh
ADDED
@@ -0,0 +1 @@
../_template/run.sh
egs/svc/VitsSVC/README.md
ADDED
@@ -0,0 +1,125 @@
# VITS for Singing Voice Conversion

This is an implementation of VITS as an end-to-end acoustic model for singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
sh egs/svc/VitsSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| ---------- | ----------- | ------- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```
egs/svc/VitsSVC/exp_config.json
ADDED
@@ -0,0 +1,162 @@
{
    "base_config": "config/vitssvc.json",
    "model_type": "VitsSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",

        "f0_min": 50,
        "f0_max": 1100,
        // f0_bin in sovits
        "pitch_bin": 256,
        // filter_length in sovits
        "n_fft": 2048,
        // hop_length in sovits
        "hop_size": 512,
        // win_length in sovits
        "win_size": 2048,
        "segment_size": 8192,
        "n_mel": 100,
        "sample_rate": 44100,

        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_energy": false,
        "extract_uv": true,
        "extract_linear_spec": true,
        "extract_audio": true,
        // contentvec
        "extract_contentvec_feature": true,
        "contentvec_sample_rate": 16000,
        "contentvec_batch_size": 1,
        "contentvec_frameshift": 0.02,
        // whisper
        "extract_whisper_feature": true,
        "whisper_sample_rate": 16000,
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_spkid": true,
        "use_contentvec": true,
        "use_whisper": true,
        "use_text": false,
        "use_phone": false,

        // Extract content features using dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256,
            //# Quantization (0 for not quantization)
            "output_melody_dim": 192,

            "use_contentvec": true,
            "use_whisper": true,
            "use_mert": false,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "content_encoder_dim": 192,
            "output_singer_dim": 192,
            "singer_table_size": 512,
            "output_content_dim": 192,
            "use_spkid": true,

            "pitch_max": 1100.0,
            "pitch_min": 50.0,
        },
        "vits": {
            "inter_channels": 192,
            "hidden_channels": 192,
            "filter_channels": 256,
            "n_heads": 2,
            "n_layers": 6,
            "kernel_size": 3,
            "p_dropout": 0.1,
            "ssl_dim": 256,
            "n_flow_layer": 4,
            "n_layers_q": 3,
            "gin_channels": 256,
            "n_speakers": 512,
            "use_spectral_norm": false,
        },
        "generator": "nsfhifigan",
    },
    "train": {
        "batch_size": 32,
        "learning_rate": 2e-4,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            3,
            50
        ],
        "keep_last": [
            3,
            2
        ],
        "run_eval": [
            true,
            true
        ],
        "adamw": {
            "lr": 2.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 30,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    },
    "inference": {
        "batch_size": 1,
    }
}
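One detail worth noting in this config is that the content features and the acoustic features live at different frame rates, so they must be aligned before being merged in the condition encoder. The arithmetic below is derived from the values above (44.1 kHz audio with `hop_size` 512 versus 16 kHz Whisper/ContentVec features); the interpolation call only illustrates the usual fix and is not Amphion's exact code:

```python
import torch
import torch.nn.functional as F

sample_rate, hop_size = 44100, 512
duration = 2.0                                        # seconds of audio

mel_frames = int(duration * sample_rate / hop_size)   # ~172 frames (about 86 fps)
whisper_frames = int(duration / (0.01 * 2))           # 10 ms shift, downsampled x2 -> 100 frames
contentvec_frames = int(duration / 0.02)              # 20 ms shift -> 100 frames

# Resample a (batch, frames, dim) content feature to the mel frame rate
feat = torch.randn(1, whisper_frames, 1024)
aligned = F.interpolate(feat.transpose(1, 2), size=mel_frames,
                        mode="nearest").transpose(1, 2)
print(mel_frames, whisper_frames, aligned.shape)      # 172 100 torch.Size([1, 172, 1024])
```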
egs/svc/VitsSVC/run.sh
ADDED
@@ -0,0 +1 @@
../_template/run.sh
egs/svc/_template/run.sh
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1".
    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
    # [Only for Inference] For advanced users, you can modify the trans_key parameter into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
        --config $exp_config \
        --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume_from_ckpt_path "$resume_from_ckpt_path" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
        echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
        exit 1
    fi

    if [ -z "$infer_source_file" ]; then
        infer_source=$infer_source_audio_dir
    fi

    if [ -z "$infer_source_audio_dir" ]; then
        infer_source=$infer_source_file
    fi

    if [ -z "$infer_target_speaker" ]; then
        echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1""
        exit 1
    fi

    if [ -z "$infer_key_shift" ]; then
        infer_key_shift="autoshift"
    fi

    if [ -z "$infer_vocoder_dir" ]; then
        infer_vocoder_dir="$work_dir"/pretrained/bigvgan
        echo "[Warning] You don't specify the infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
    fi

    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
        --config $exp_config \
        --acoustics_dir $infer_expt_dir \
        --vocoder_dir $infer_vocoder_dir \
        --target_singer $infer_target_speaker \
        --trans_key $infer_key_shift \
        --source $infer_source \
        --output_dir $infer_output_dir \
        --log_level debug
fi
egs/tta/README.md
ADDED
@@ -0,0 +1,19 @@
# Amphion Text-to-Audio (TTA) Recipe

## Quick Start

We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).

## Supported Model Architectures

Until now, Amphion has supported a latent diffusion based text-to-audio model:

<br>
<div align="center">
  <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
</div>
<br>

Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training:
1. Training the VAE, which is called `AutoencoderKL` in Amphion.
2. Training the conditional latent diffusion model, which is called `AudioLDM` in Amphion.
egs/tta/RECIPE.md
ADDED
@@ -0,0 +1,156 @@
# Text-to-Audio with Latent Diffusion Model

This is the quick tour for training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of the [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.

<br>
<div align="center">
  <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
</div>
<br>

We train this latent diffusion model in two stages:
1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project the input mel-spectrograms into an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality.
2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model, with a T5 encoder as the text encoder.

There are four stages in total for training the text-to-audio model:

1. Data preparation and processing
2. Train the VAE model
3. Train the latent diffusion model
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## Overview

```sh
# Train the VAE model
sh egs/tta/autoencoderkl/run_train.sh

# Train the latent diffusion model
sh egs/tta/audioldm/run_train.sh

# Inference
sh egs/tta/audioldm/run_inference.sh
```

## 1. Data preparation and processing

### Dataset Download

We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).

<!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
<!-- You can download the dataset [here](https://github.com/cdjkim/audiocaps). -->

### Data Processing

- Download the AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.

  ```json
  {
      "dataset": [
          "AudioCaps"
      ],
      "preprocess": {
          // Specify the output root path to save the processed data
          "processed_dir": "[Your path to save tta dataset]",
          ...
      }
  }
  ```

  The folder structure of your downloaded data should be similar to:

  ```plaintext
  .../[Your path to save tta dataset]
  ┣ AudioCaps
  ┃ ┣ wav
  ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
  ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
  ┃ ┃ ┣ ...
  ```

- Then you may process the data into mel-spectrograms and save them in `.npy` format. If you use the data we provide, we have processed all the wav data.

- Generate a json file to save the metadata; the json file looks like:

  ```json
  [
      {
          "Dataset": "AudioCaps",
          "Uid": "---1_cCGK4M_0_10000",
          "Caption": "Idling car, train blows horn and passes"
      },
      {
          "Dataset": "AudioCaps",
          "Uid": "---lTs1dxhU_30000_40000",
          "Caption": "A racing vehicle engine is heard passing by"
      },
      ...
  ]
  ```

- Finally, the folder structure is like:

  ```plaintext
  .../[Your path to save tta dataset]
  ┣ AudioCaps
  ┃ ┣ wav
  ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
  ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
  ┃ ┃ ┣ ...
  ┃ ┣ mel
  ┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
  ┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
  ┃ ┃ ┣ ...
  ┃ ┣ train.json
  ┃ ┣ valid.json
  ┃ ┣ ...
  ```

## 2. Training the VAE Model

The first stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following commands:

```sh
sh egs/tta/autoencoderkl/run_train.sh
```

## 3. Training the Latent Diffusion Model

The second stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following commands:

```sh
sh egs/tta/audioldm/run_train.sh
```

## 4. Inference

Now you can generate audio with your pre-trained latent diffusion model. Run the following commands and modify the `text` argument.

```sh
sh egs/tta/audioldm/run_inference.sh \
    --text "A man is whistling"
```

## Citations

```bibtex
@article{wang2023audit,
  title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
  author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
  journal={NeurIPS 2023},
  year={2023}
}

@article{liu2023audioldm,
  title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
  author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
  journal={Proceedings of the International Conference on Machine Learning},
  year={2023}
}
```
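As a rough sketch of the two preprocessing steps in the recipe above (computing mel-spectrograms as `.npy` files and writing the metadata json), here is a hedged example assuming a librosa-style pipeline; the STFT parameters and the caption lookup are illustrative placeholders, not the exact settings Amphion uses:

```python
import json
from pathlib import Path

import librosa
import numpy as np

root = Path("[Your path to save tta dataset]") / "AudioCaps"
(root / "mel").mkdir(parents=True, exist_ok=True)

# uid -> caption, e.g. loaded from the official AudioCaps csv (placeholder here)
captions = {"---1_cCGK4M_0_10000": "Idling car, train blows horn and passes"}

metadata = []
for wav_path in sorted((root / "wav").glob("*.wav")):
    y, sr = librosa.load(wav_path, sr=16000)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024,
                                         hop_length=256, n_mels=80)
    log_mel = np.log(np.clip(mel, 1e-5, None))                 # log-mel, clipped for stability
    np.save(root / "mel" / f"{wav_path.stem}.npy", log_mel.astype(np.float32))
    metadata.append({"Dataset": "AudioCaps",
                     "Uid": wav_path.stem,
                     "Caption": captions.get(wav_path.stem, "")})

with open(root / "train.json", "w") as f:
    json.dump(metadata, f, indent=2)
```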
egs/tta/audioldm/exp_config.json
ADDED
@@ -0,0 +1,90 @@
{
    "base_config": "egs/tta/audioldm/exp_config_base.json",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "data",
        // For example: "/home/TTADataset/processed_data"

        // feature
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        // feature for text to audio
        "use_caption": true,
        "use_melspec": true,
        "use_wav": false,
        // feature dir
        "melspec_dir": "mel",
        "wav_dir": "wav"
    },
    // Specify the output root path to save model ckpts and logs
    "log_dir": "ckpts/tta",
    // For example: "/home/TTADataset/processed_data/logs"

    // model
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [4, 2, 1],
            "num_res_blocks": 2,
            "channel_mult": [1, 2, 4],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1,1,2,2,4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        },
        "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
    },

    // train
    "train": {
        "adam": {
            "lr": 5.0e-5
        },
        "ddp": false,
        "random_seed": 12345,
        "batch_size": 12,
        "epochs": 50000,
        "max_steps": 1000000,
        "total_training_steps": 800000,
        "save_summary_steps": 1000,
        "save_checkpoints_steps": 5000,
        "valid_interval": 5000,
        "keep_checkpoint_max": 100
    }
}
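The `noise_scheduler` block above uses the `scaled_linear` beta schedule commonly used in latent-diffusion-style models, with `prediction_type: epsilon` (the network predicts the added noise). If the usual convention holds, `scaled_linear` interpolates linearly in sqrt(beta) space and then squares; a small, library-free sketch of that assumption:

```python
import torch

def scaled_linear_betas(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012):
    # Linear in sqrt(beta), then squared -- the usual "scaled_linear" convention (assumed).
    return torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps) ** 2

betas = scaled_linear_betas()
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
print(betas[0].item(), betas[-1].item(), alphas_cumprod[-1].item())
```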
egs/tta/audioldm/exp_config_base.json
ADDED
@@ -0,0 +1,11 @@
{
    "base_config": "config/audioldm.json",
    "model_type": "AudioLDM",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        "train_file": "train.json",
        "valid_file": "vaild.json"
    }
}
egs/tta/audioldm/exp_config_latent_4_10_78.json
ADDED
@@ -0,0 +1,88 @@
{
    "base_config": "egs/tta/audioldm/exp_config_base.json",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "data",

        // feature
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        // feature for text to audio
        "use_caption": true,
        "use_melspec": true,
        "use_wav": false,
        // feature dir
        "melspec_dir": "mel",
        "wav_dir": "wav"
    },
    // Specify the output root path to save model ckpts and logs
    "log_dir": "ckpts/tta",

    // model
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [4, 2, 1],
            "num_res_blocks": 2,
            "channel_mult": [1, 2, 4],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1,2,2,4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        },
        "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
    },

    // train
    "train": {
        "adam": {
            "lr": 2.0e-5
        },
        "ddp": false,
        "random_seed": 12345,
        "batch_size": 12,
        "epochs": 50000,
        "max_steps": 1000000,
        "total_training_steps": 800000,
        "save_summary_steps": 1000,
        "save_checkpoints_steps": 5000,
        "valid_interval": 5000,
        "keep_checkpoint_max": 100
    }
}
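Compared with the default `exp_config.json`, the substantive changes here are the autoencoder's `ch_mult` (one fewer downsampling stage) and a lower learning rate. With one fewer stage, the KL autoencoder compresses the mel-spectrogram by 8x per axis instead of 16x, which is presumably where the `4_10_78` versus `4_5_39` latent shapes in the experiment names come from. A quick back-of-the-envelope check, treating the mel input as roughly 80 bins by 624 frames (these input dimensions are an inference from the latent shapes, not stated in the configs):

```python
# Downsampling factor of the KL autoencoder is typically 2 ** (len(ch_mult) - 1)
for name, ch_mult in [("default", [1, 1, 2, 2, 4]), ("latent_4_10_78", [1, 2, 2, 4])]:
    factor = 2 ** (len(ch_mult) - 1)
    n_mel, n_frames = 80, 624          # assumed mel-spectrogram size
    print(name, factor, (4, n_mel // factor, n_frames // factor))
# default         16  (4, 5, 39)
# latent_4_10_78   8  (4, 10, 78)
```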
egs/tta/audioldm/run_inference.sh
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config.json"
exp_name="audioldm_debug_latent_size_4_5_39"
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
output_dir="$work_dir/temp"
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
num_steps=200
guidance_scale=4.0

export CUDA_VISIBLE_DEVICES="0"

######## Parse Command Line Arguments ###########
while [[ $# -gt 0 ]]
do
  key="$1"

  case $key in
    --text)
      text="$2"
      shift # past argument
      shift # past value
      ;;
    *) # unknown option
      shift # past argument
      ;;
  esac
done

######## Run inference ###########
python "${work_dir}"/bins/tta/inference.py \
    --config=$exp_config \
    --checkpoint_path=$checkpoint_path \
    --text="$text" \
    --vocoder_path=$vocoder_path \
    --vocoder_config_path=$vocoder_config_path \
    --num_steps=$num_steps \
    --guidance_scale=$guidance_scale \
    --output_dir=$output_dir
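For context on `guidance_scale=4.0` above: text-conditional latent diffusion models of this kind are typically sampled with classifier-free guidance, where each denoising step combines a conditional and an unconditional noise prediction. A hedged sketch of that combination step (not Amphion's inference code):

```python
import torch

def guided_noise(eps_uncond: torch.Tensor, eps_cond: torch.Tensor,
                 guidance_scale: float = 4.0) -> torch.Tensor:
    # Classifier-free guidance: push the prediction away from the unconditional one.
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

# e.g. latents shaped like the 4 x 10 x 78 configuration
eps_u, eps_c = torch.randn(1, 4, 10, 78), torch.randn(1, 4, 10, 78)
eps = guided_noise(eps_u, eps_c)   # used in place of eps_cond in the sampler update
```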
egs/tta/audioldm/run_inference_latent_4_10_78.sh
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config_v2.json"
exp_name="audioldm_debug_latent_size_4_10_78"
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
output_dir="$work_dir/temp"
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
num_steps=200
guidance_scale=4.0

export CUDA_VISIBLE_DEVICES="0"

######## Parse Command Line Arguments ###########
while [[ $# -gt 0 ]]
do
  key="$1"

  case $key in
    --text)
      text="$2"
      shift # past argument
      shift # past value
      ;;
    *) # unknown option
      shift # past argument
      ;;
  esac
done

######## Run inference ###########
python "${work_dir}"/bins/tta/inference.py \
    --config=$exp_config \
    --checkpoint_path=$checkpoint_path \
    --text="A man is whistling" \
    --vocoder_path=$vocoder_path \
    --vocoder_config_path=$vocoder_config_path \
    --num_steps=$num_steps \
    --guidance_scale=$guidance_scale \
    --output_dir=$output_dir \
egs/tta/audioldm/run_train.sh
ADDED
@@ -0,0 +1,26 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config.json"
exp_name="audioldm_debug_latent_size_4_5_39"

num_workers=8
export CUDA_VISIBLE_DEVICES="0"

######## Train Model ###########
python "${work_dir}"/bins/tta/train_tta.py \
    --config=$exp_config \
    --num_workers=$num_workers \
    --exp_name=$exp_name \
    --stdout_interval=25 \
egs/tta/audioldm/run_train_latent_4_10_78.sh
ADDED
@@ -0,0 +1,26 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config_latent_4_10_78.json"
exp_name="audioldm_debug_latent_size_4_10_78"

num_workers=8
export CUDA_VISIBLE_DEVICES="0"

######## Train Model ###########
python "${work_dir}"/bins/tta/train_tta.py \
    --config=$exp_config \
    --num_workers=$num_workers \
    --exp_name=$exp_name \
    --stdout_interval=25 \
egs/tta/autoencoderkl/exp_config.json
ADDED
@@ -0,0 +1,49 @@
{
    "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "data",

        // feature
        "use_spk": false,
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        // feature for text to audio
        "use_caption": true,
        "use_melspec": true,
        "use_wav": false,
        // feature dir
        "melspec_dir": "mel",
        "wav_dir": "wav"
    },
    // Specify the output root path to save model ckpts and logs
    "log_dir": "ckpts/tta",

    // train
    "train": {
        "adam": {
            "lr": 4.0e-5
        },
        "ddp": false,
        "random_seed": 12345,
        "batch_size": 12,
        "epochs": 50000,
        "max_steps": 1000000,
        "total_training_steps": 800000,
        "save_summary_steps": 1000,
        "save_checkpoints_steps": 5000,
        "valid_interval": 5000,
        "keep_checkpoint_max": 100
    }
}
egs/tta/autoencoderkl/exp_config_base.json
ADDED
@@ -0,0 +1,11 @@
+{
+    "base_config": "config/autoencoderkl.json",
+    "model_type": "AutoencoderKL",
+    "dataset": [
+        "AudioCaps"
+    ],
+    "preprocess": {
+        "train_file": "train.json",
+        "valid_file": "vaild.json"
+    }
+}
egs/tta/autoencoderkl/exp_config_latent_4_10_78.json
ADDED
@@ -0,0 +1,59 @@
+{
+    "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
+    "dataset": [
+        "AudioCaps"
+    ],
+    "preprocess": {
+        // Specify the output root path to save the processed data
+        "processed_dir": "data",
+
+        // feature
+        "use_spkid": false,
+        "use_uv": false,
+        "use_frame_pitch": false,
+        "use_phone_pitch": false,
+        "use_frame_energy": false,
+        "use_phone_energy": false,
+        "use_mel": false,
+        "use_audio": false,
+        "use_label": false,
+        "use_one_hot": false,
+        // feature for text to audio
+        "use_caption": true,
+        "use_melspec": true,
+        "use_wav": false,
+        // feature dir
+        "melspec_dir": "mel",
+        "wav_dir": "wav"
+    },
+    // Specify the output root path to save model ckpts and logs
+    "log_dir": "ckpts/tta",
+
+    "model": {
+        "autoencoderkl": {
+            "ch": 128,
+            "ch_mult": [1,2,2,4],
+            "num_res_blocks": 2,
+            "in_channels": 1,
+            "z_channels": 4,
+            "out_ch": 1,
+            "double_z": true
+        }
+    },
+    // train
+    "train": {
+        "adam": {
+            "lr": 4.0e-5
+        },
+        "ddp": false,
+        "random_seed": 12345,
+        "batch_size": 12,
+        "epochs": 50000,
+        "max_steps": 1000000,
+        "total_training_steps": 800000,
+        "save_summary_steps": 1000,
+        "save_checkpoints_steps": 5000,
+        "valid_interval": 5000,
+        "keep_checkpoint_max": 100
+    }
+}
egs/tta/autoencoderkl/run_train.sh
ADDED
@@ -0,0 +1,26 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Set Experiment Configuration ###########
+exp_config="$exp_dir/exp_config.json"
+exp_name="autoencoder_kl_debug"
+
+num_workers=8
+export CUDA_VISIBLE_DEVICES="0"
+
+######## Train Model ###########
+python "${work_dir}"/bins/tta/train_tta.py \
+    --config=$exp_config \
+    --num_workers=$num_workers \
+    --exp_name=$exp_name \
+    --stdout_interval=25 \
egs/tta/autoencoderkl/run_train_latent_4_10_78.sh
ADDED
@@ -0,0 +1,26 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Set Experiment Configuration ###########
+exp_config="$exp_dir/exp_config_latent_4_10_78.json"
+exp_name="autoencoder_kl_debug_latent_size_4_10_78"
+
+num_workers=8
+export CUDA_VISIBLE_DEVICES="0"
+
+######## Train Model ###########
+python "${work_dir}"/bins/tta/train_tta.py \
+    --config=$exp_config \
+    --num_workers=$num_workers \
+    --exp_name=$exp_name \
+    --stdout_interval=25 \
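AudioLDM-style text-to-audio pipelines usually train the KL autoencoder before the latent diffusion model that operates in its latent space; a sketch of that order with the scripts above (the ordering reflects the usual AudioLDM setup rather than anything these scripts enforce):

```bash
# 1. Train the KL autoencoder on AudioCaps mel-spectrograms
sh egs/tta/autoencoderkl/run_train_latent_4_10_78.sh

# 2. Train the latent diffusion model on the resulting latents
sh egs/tta/audioldm/run_train_latent_4_10_78.sh

# 3. Generate audio from text with the trained checkpoints
sh egs/tta/audioldm/run_inference_latent_4_10_78.sh --text "A man is whistling"
```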
egs/tts/FastSpeech2/README.md
ADDED
@@ -0,0 +1,132 @@
+
+# FastSpeech2 Recipe
+
+In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+You can use any of the commonly used TTS datasets to train the model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech for your first training run. How to download the datasets is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "LJSpeech",
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "LJSpeech": "[LJSpeech dataset path]",
+    },
+```
+
+## 2. Features Extraction
+
+### Configuration
+
+Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path
+    "log_dir": "ckpts/tts",
+    "preprocess": {
+        // TODO: Fill in the output data path
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`):
+
+```bash
+sh egs/tts/FastSpeech2/run.sh --stage 1
+```
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```
+"train": {
+    "batch_size": 16,
+  }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.
+
+```bash
+sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
+
+
+## 4. Inference
+
+### Configuration
+
+For inference, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_expt_dir` | The experiment directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` |
+| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
+| `--infer_dataset` | The dataset used for inference. | For the LJSpeech dataset, the inference dataset would be `LJSpeech`. |
+| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For the LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at feature extraction, or "`golden_test`" cherry-picked from the test set as a template testing set. |
+| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
+
+### Run
+For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:
+
+```bash
+sh egs/tts/FastSpeech2/run.sh --stage 3 \
+    --infer_expt_dir ckpts/tts/[YourExptName] \
+    --infer_output_dir ckpts/tts/[YourExptName]/result \
+    --infer_mode "batch" \
+    --infer_dataset "LJSpeech" \
+    --infer_testing_set "test"
+```
+
+Or, if you want to generate a single clip of speech from a given text, just run:
+
+```bash
+sh egs/tts/FastSpeech2/run.sh --stage 3 \
+    --infer_expt_dir ckpts/tts/[YourExptName] \
+    --infer_output_dir ckpts/tts/[YourExptName]/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model."
+```
+
+We will release a pre-trained FastSpeech2 model trained on LJSpeech, so you can download the pre-trained model and generate speech following the above inference instructions.
+
+```bibtex
+@inproceedings{ren2020fastspeech,
+  title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
+  author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
+  booktitle={International Conference on Learning Representations},
+  year={2020}
+}
+```
egs/tts/FastSpeech2/exp_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+    "base_config": "config/fs2.json",
+    "model_type": "FastSpeech2",
+    "dataset": [
+        "LJSpeech"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "LJSpeech": "[LJSpeech dataset path]"
+    },
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
+    "log_dir": "ckpts/tts",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        "sample_rate": 22050,
+    },
+    "train": {
+        "batch_size": 16,
+    }
+}
egs/tts/FastSpeech2/prepare_mfa.sh
ADDED
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+#!/bin/bash
+mkdir mfa
+cd mfa
+wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz
+tar -zxvf montreal-forced-aligner_linux.tar.gz
+cd mfa
+mkdir lexicon
+cd lexicon
+wget http://www.openslr.org/resources/11/librispeech-lexicon.txt
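`run.sh` below calls this script automatically during stage 1 whenever the `mfa/` directory is missing. To fetch the aligner and lexicon manually beforehand, something like the following should work (the script creates `mfa/` relative to the current directory, so run it from the Amphion root):

```bash
cd Amphion
bash egs/tts/FastSpeech2/prepare_mfa.sh
ls mfa    # montreal-forced-aligner_linux.tar.gz, mfa/, ...
```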
egs/tts/FastSpeech2/run.sh
ADDED
@@ -0,0 +1,150 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+cd $work_dir/modules/monotonic_align
+mkdir -p monotonic_align
+python setup.py build_ext --inplace
+cd $work_dir
+
+mfa_dir=$work_dir/mfa
+echo $mfa_dir
+
+######## Parse the Given Parameters from the Command ###########
+# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
+options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")
+eval set -- "$options"
+
+while true; do
+    case $1 in
+        # Experimental Configuration File
+        -c | --config) shift; exp_config=$1 ; shift ;;
+        # Experimental Name
+        -n | --name) shift; exp_name=$1 ; shift ;;
+        # Running Stage
+        -s | --stage) shift; running_stage=$1 ; shift ;;
+        # Visible GPU machines. The default value is "0".
+        --gpu) shift; gpu=$1 ; shift ;;
+
+        # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+        --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+        # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+        --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+        # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
+        --infer_mode) shift; infer_mode=$1 ; shift ;;
+        # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch".
+        --infer_dataset) shift; infer_dataset=$1 ; shift ;;
+        # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be the "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
+        --infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
+        # [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single".
+        --infer_text) shift; infer_text=$1 ; shift ;;
+
+        --) shift ; break ;;
+        *) echo "Invalid option: $1" ; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    if [ ! -d "$mfa_dir" ]; then
+        bash ${exp_dir}/prepare_mfa.sh
+    fi
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
+        --config=$exp_config \
+        --num_workers=4 \
+        --prepare_alignment=true
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \
+        --config $exp_config \
+        --exp_name $exp_name \
+        --log_level debug
+fi
+
+######## Inference ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        infer_output_dir="$expt_dir/result"
+    fi
+
+    if [ -z "$infer_mode" ]; then
+        echo "[Error] Please specify the inference mode, e.g., "batch", "single""
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then
+        echo "[Error] Please specify the dataset used in inference when the inference mode is batch"
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then
+        echo "[Error] Please specify the testing set used in inference when the inference mode is batch"
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
+        echo "[Error] Please specify the text to be synthesized when the inference mode is single"
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "single" ]; then
+        echo 'Text: ' ${infer_text}
+        infer_dataset=None
+        infer_testing_set=None
+    elif [ "$infer_mode" = "batch" ]; then
+        infer_text=''
+    fi
+
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
+        --config $exp_config \
+        --acoustics_dir $infer_expt_dir \
+        --output_dir $infer_output_dir \
+        --mode $infer_mode \
+        --dataset $infer_dataset \
+        --testing_set $infer_testing_set \
+        --text "$infer_text" \
+        --log_level debug \
+        --vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
+
+
+fi
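Putting the three stages together, a typical LJSpeech workflow with this script looks like the sketch below; `MyFastSpeech2` is a placeholder experiment name, and the hard-coded `--vocoder_dir` in the inference branch above will likely need to point at your own HiFi-GAN checkpoints:

```bash
cd Amphion
sh egs/tts/FastSpeech2/run.sh --stage 1                       # preprocess + MFA alignment
sh egs/tts/FastSpeech2/run.sh --stage 2 --name MyFastSpeech2  # train
sh egs/tts/FastSpeech2/run.sh --stage 3 \
    --infer_expt_dir ckpts/tts/MyFastSpeech2 \
    --infer_output_dir ckpts/tts/MyFastSpeech2/result \
    --infer_mode "single" \
    --infer_text "This is a clip of generated speech."        # synthesize
```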
egs/tts/NaturalSpeech2/exp_config.json
ADDED
@@ -0,0 +1,39 @@
+{
+    "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
+    "dataset": [
+        "LibriTTS"
+    ],
+    "preprocess": {
+        // Specify the output root path to save the processed data
+        "processed_dir": "[LibriTTS dataset path]",
+        "train_file": "train.json",
+        "valid_file": "test.json",
+        "read_metadata": true,
+        "metadata_dir": "metadata"
+    },
+    // Specify the output root path to save model ckpts and logs
+    "log_dir": "ckpts/tts",
+    "train": {
+        // New trainer and Accelerator
+        "gradient_accumulation_step": 1,
+        "tracker": ["tensorboard"],
+        "max_epoch": 5000,
+        "save_checkpoint_stride": [1],
+        "keep_last": [1000],
+        "run_eval": [true],
+        "dataloader": {
+            "num_worker": 16,
+            "pin_memory": true
+        },
+        "adam": {
+            "lr": 1.0e-4
+        },
+        "use_dynamic_batchsize": true,
+        "batch_size": 8,
+        "max_tokens": 7500,
+        "max_sentences": 32,
+        "lr_warmup_steps": 5000,
+        "lr_scheduler": "cosine",
+        "num_train_steps": 800000
+    }
+}
egs/tts/NaturalSpeech2/exp_config_base.json
ADDED
@@ -0,0 +1,118 @@
+{
+    "base_config": "config/ns2.json",
+    "model_type": "NaturalSpeech2",
+    "dataset": [
+        "LibriTTS"
+    ],
+    "preprocess": {
+        "use_mel": false,
+        "use_code": true,
+        "use_spkid": true,
+        "use_pitch": true,
+        "use_duration": true,
+        "use_phone": true,
+        "use_len": true,
+        "use_cross_reference": true,
+        "train_file": "train.json",
+        "valid_file": "test.json",
+        "melspec_dir": "mel",
+        "code_dir": "code",
+        "pitch_dir": "pitch",
+        "duration_dir": "duration",
+        "metadata_dir": "metadata",
+        "read_metadata": true,
+        "clip_mode": "start"
+    },
+    "model": {
+        "latent_dim": 128,
+        "prior_encoder": {
+            "vocab_size": 100,
+            "pitch_min": 50,
+            "pitch_max": 1100,
+            "pitch_bins_num": 512,
+            "encoder": {
+                "encoder_layer": 6,
+                "encoder_hidden": 512,
+                "encoder_head": 8,
+                "conv_filter_size": 2048,
+                "conv_kernel_size": 9,
+                "encoder_dropout": 0.2,
+                "use_cln": true
+            },
+            "duration_predictor": {
+                "input_size": 512,
+                "filter_size": 512,
+                "kernel_size": 3,
+                "conv_layers": 30,
+                "cross_attn_per_layer": 3,
+                "attn_head": 8,
+                "drop_out": 0.5
+            },
+            "pitch_predictor": {
+                "input_size": 512,
+                "filter_size": 512,
+                "kernel_size": 5,
+                "conv_layers": 30,
+                "cross_attn_per_layer": 3,
+                "attn_head": 8,
+                "drop_out": 0.5
+            }
+        },
+        "diffusion": {
+            "wavenet": {
+                "input_size": 128,
+                "hidden_size": 512,
+                "out_size": 128,
+                "num_layers": 40,
+                "cross_attn_per_layer": 3,
+                "dilation_cycle": 2,
+                "attn_head": 8,
+                "drop_out": 0.2
+            },
+            "beta_min": 0.05,
+            "beta_max": 20,
+            "sigma": 1.0,
+            "noise_factor": 1.0,
+            "ode_solver": "euler",
+            "diffusion_type": "diffusion"
+        },
+        "prompt_encoder": {
+            "encoder_layer": 6,
+            "encoder_hidden": 512,
+            "encoder_head": 8,
+            "conv_filter_size": 2048,
+            "conv_kernel_size": 9,
+            "encoder_dropout": 0.2,
+            "use_cln": false
+        },
+        "query_emb": {
+            "query_token_num": 32,
+            "hidden_size": 512,
+            "head_num": 8
+        },
+        "inference_step": 500
+    },
+    "train": {
+        "use_dynamic_batchsize": true,
+        "max_tokens": 7500,
+        "max_sentences": 32,
+        "lr_warmup_steps": 5000,
+        "lr_scheduler": "cosine",
+        "num_train_steps": 800000,
+        "adam": {
+            "lr": 7.5e-5
+        },
+        "diff_ce_loss_lambda": 0.5,
+        "diff_noise_loss_lambda": 1.0,
+        "ddp": false,
+        "random_seed": 114,
+        "batch_size": 32,
+        "epochs": 5000,
+        "max_steps": 1000000,
+        "total_training_steps": 800000,
+        "save_summary_steps": 500,
+        "save_checkpoints_steps": 2000,
+        "valid_interval": 2000,
+        "keep_checkpoint_max": 100
+    }
+}
egs/tts/NaturalSpeech2/run_inference.sh
ADDED
@@ -0,0 +1,43 @@
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Set Experiment Configuration ###########
+exp_config="$exp_dir/exp_config.json"
+exp_name="ns2_libritts"
+ref_audio="$work_dir/egs/tts/NaturalSpeech2/prompt_example/ref_audio.wav"
+checkpoint_path="$work_dir/ckpts/tts/ns2_libritts/checkpoint/epoch-0065_step-0376136_loss-7.126379"
+output_dir="$work_dir/output"
+mode="single"
+
+export CUDA_VISIBLE_DEVICES="0"
+
+######## Parse Command Line Arguments ###########
+while [[ $# -gt 0 ]]
+do
+    key="$1"
+
+    case $key in
+        --text)
+            text="$2"
+            shift # past argument
+            shift # past value
+            ;;
+        *) # unknown option
+            shift # past argument
+            ;;
+    esac
+done
+
+######## Run Inference ###########
+python "${work_dir}"/bins/tts/inference.py \
+    --config=$exp_config \
+    --text="$text" \
+    --mode=$mode \
+    --checkpoint_path=$checkpoint_path \
+    --ref_audio=$ref_audio \
+    --output_dir=$output_dir \
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
######## Build Experiment Environment ###########
|
2 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
3 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
4 |
+
|
5 |
+
export WORK_DIR=$work_dir
|
6 |
+
export PYTHONPATH=$work_dir
|
7 |
+
export PYTHONIOENCODING=UTF-8
|
8 |
+
|
9 |
+
######## Set Experiment Configuration ###########
|
10 |
+
exp_config="$exp_dir/exp_config.json"
|
11 |
+
exp_name="ns2_libritts"
|
12 |
+
|
13 |
+
######## Train Model ###########
|
14 |
+
CUDA_VISIBLE_DEVICES="0" accelerate \
|
15 |
+
"${work_dir}"/bins/tts/train.py \
|
16 |
+
--config=$exp_config \
|
17 |
+
--exp_name=$exp_name \
|
18 |
+
--log_level debug \
|
egs/tts/README.md
ADDED
@@ -0,0 +1,17 @@
+
+# Amphion Text-to-Speech (TTS) Recipe
+
+## Quick Start
+
+We provide a **[beginner recipe](VALLE/)** to demonstrate how to train a cutting-edge TTS model. Specifically, it is Amphion's re-implementation of [VALL-E](https://arxiv.org/abs/2301.02111), a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+
+## Supported Model Architectures
+
+So far, Amphion TTS supports the following models and architectures:
+- **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
+- **[VITS](VITS)**: An end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
+- **[VALL-E](VALLE)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+- **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): A TTS architecture that utilizes a latent diffusion model to generate natural-sounding voices.
+
+## Amphion TTS Demo
+Here are some [TTS samples](https://openhlt.github.io/Amphion_TTS_Demo/) from Amphion.
egs/tts/VALLE/README.md
ADDED
@@ -0,0 +1,139 @@
+# VALL-E Recipe
+
+In this recipe, we will show how to train [VALL-E](https://arxiv.org/abs/2301.02111) using Amphion's infrastructure. VALL-E is a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+You can use any of the commonly used TTS datasets to train the VALL-E model, e.g., LibriTTS. We strongly recommend using LibriTTS for your first training run. How to download the datasets is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "libritts",
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "libritts": "[LibriTTS dataset path]",
+    },
+```
+
+## 2. Features Extraction
+
+### Configuration
+
+Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
+    "log_dir": "ckpts/tts",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`):
+
+```bash
+sh egs/tts/VALLE/run.sh --stage 1
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
+
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```
+"train": {
+    "batch_size": 4,
+  }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+
+Specifically, VALL-E needs an autoregressive (AR) model to be trained first and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train the AR model, and set `--model_train_stage 2` to train the NAR model, where `--ar_model_ckpt_dir` should be set to the checkpoint path of the trained AR model.
+
+To train an AR model, just run:
+
+```bash
+sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]
+```
+
+To train a NAR model, just run:
+```bash
+sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
+```
+<!-- > **NOTE:** To train a NAR model, `--checkpoint_path` should be set as the checkpoint path to the trained AR model. -->
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
+
+
+## 4. Inference
+
+### Configuration
+
+For inference, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_expt_dir` | The experiment directory of the NAR model which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
+| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
+| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
+| `--infer_text_prompt` | The text prompt for inference. | The text prompt should be aligned with the audio prompt. |
+| `--infer_audio_prompt` | The audio prompt for inference. | The audio prompt should be aligned with the text prompt. |
+| `--test_list_file` | The test list file used for batch inference. | The format of the test list file is `text\|text_prompt\|audio_prompt`. |
+
+### Run
+For example, if you want to generate a single clip of speech, just run:
+
+```bash
+sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model." \
+    --infer_text_prompt "But even the unsuccessful dramatist has his moments." \
+    --infer_audio_prompt egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav
+```
+
+We released a pre-trained Amphion VALL-E model, so you can download the pre-trained model [here](https://huggingface.co/amphion/valle-libritts) and generate speech following the above inference instructions.
+
+```bibtex
+@article{wang2023neural,
+  title={Neural codec language models are zero-shot text to speech synthesizers},
+  author={Wang, Chengyi and Chen, Sanyuan and Wu, Yu and Zhang, Ziqiang and Zhou, Long and Liu, Shujie and Chen, Zhuo and Liu, Yanqing and Wang, Huaming and Li, Jinyu and others},
+  journal={arXiv preprint arXiv:2301.02111},
+  year={2023}
+}
+```
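For batch inference, the VALL-E `run.sh` (further below) accepts `--infer_mode "batch"` together with `--infer_test_list_file`. Based on the `text|text_prompt|audio_prompt` format described in the table above, a test list could be prepared like this sketch (the file name and its two entries are only illustrative):

```bash
# Each line: target_text|text_prompt|audio_prompt
cat > my_test_list.txt << 'EOF'
This is a clip of generated speech.|But even the unsuccessful dramatist has his moments.|egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav
Another sentence to synthesize.|The girl entered, and gave an involuntary cry of surprise.|egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav
EOF

sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir ckpts/tts/[YourExptName] \
    --infer_output_dir ckpts/tts/[YourExptName]/result \
    --infer_mode "batch" \
    --infer_test_list_file my_test_list.txt
```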
egs/tts/VALLE/exp_config.json
ADDED
@@ -0,0 +1,33 @@
+{
+    "base_config": "config/valle.json",
+    "model_type": "VALLE",
+    "dataset": [
+        "libritts"
+    ],
+    "dataset_path": {
+        "libritts": "[LibriTTS dataset path]"
+    },
+    "preprocess": {
+        "extract_phone": true,
+        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+        "extract_acoustic_token": true,
+        "use_phone": true,
+        "use_acoustic_token": true,
+        "processed_dir": "Amphion/data/",
+        "sample_rate": 24000, // "Audio sampling rate."
+        "codec_hop_size": "320", // "Audio codec hop size."
+        "valid_file": "test.json",
+    },
+    "model": {
+        "prefix_mode": 1, // "The mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.",
+    },
+    "log_dir": "Amphion/ckpts/tts/valle",
+    "train": {
+        "batch_size": 4,
+        "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
+        "max_epoch": 20, // "Number of epochs to train."
+        "use_dynamic_batchsize": true, // If use dynamic batch size
+        "max_tokens": 4000, // If use dynamic batch size
+        "max_sentences": 10 // If use dynamic batch size
+    }
+}
egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt
ADDED
@@ -0,0 +1 @@
+I almost think I can remember feeling a little different.
egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt
ADDED
@@ -0,0 +1 @@
+Ten sons sat at meat with him, and I was the youngest.
egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt
ADDED
@@ -0,0 +1 @@
+The girl entered, and gave an involuntary cry of surprise.
egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt
ADDED
@@ -0,0 +1 @@
+But even the unsuccessful dramatist has his moments.
egs/tts/VALLE/run.sh
ADDED
@@ -0,0 +1,158 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+cd $work_dir/modules/monotonic_align
+mkdir -p monotonic_align
+python setup.py build_ext --inplace
+cd $work_dir
+
+######## Parse the Given Parameters from the Command ###########
+options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage: -- "$@")
+eval set -- "$options"
+
+while true; do
+    case $1 in
+        # Experimental Configuration File
+        -c | --config) shift; exp_config=$1 ; shift ;;
+        # Experimental Name
+        -n | --name) shift; exp_name=$1 ; shift ;;
+        # Running Stage
+        -s | --stage) shift; running_stage=$1 ; shift ;;
+        # Visible GPU machines. The default value is "0".
+        --gpu) shift; gpu=$1 ; shift ;;
+
+        # [Only for Training] Model training stage.
+        --model_train_stage) shift; model_train_stage=$1 ; shift ;;
+        # [Only for Training] The stage1 ckpt dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+        --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;;
+
+        # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+        --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+        # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+        --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+
+        # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
+        --infer_mode) shift; infer_mode=$1 ; shift ;;
+        # [Only for Inference] The inference test list file. It is only used when the inference mode is "batch".
+        --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;;
+        # [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single".
+        --infer_text) shift; infer_text=$1 ; shift ;;
+        # [Only for Inference] The inference text prompt. It is only used when the inference mode is "single".
+        --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;;
+        # [Only for Inference] The inference audio prompt. It is only used when the inference mode is "single".
+        --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;;
+
+        --) shift ; break ;;
+        *) echo "Invalid option: $1" ; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
+        --config=$exp_config \
+        --num_workers=4
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+
+    if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then
+        echo "[Error] Please specify the checkpoint path to the trained model in stage1."
+        exit 1
+    fi
+
+    if [ "$model_train_stage" = "1" ]; then
+        ar_model_ckpt_dir=None
+    fi
+
+    echo "Experimental Name: $exp_name"
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \
+        "${work_dir}"/bins/tts/train.py \
+        --config $exp_config \
+        --exp_name $exp_name \
+        --log_level debug \
+        --train_stage $model_train_stage \
+        --checkpoint_path $ar_model_ckpt_dir
+fi
+
+
+######## Inference ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        infer_output_dir="$expt_dir/result"
+    fi
+
+    if [ -z "$infer_mode" ]; then
+        echo "[Error] Please specify the inference mode, e.g., "batch", "single""
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then
+        echo "[Error] Please specify the test list file used in inference when the inference mode is batch"
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
+        echo "[Error] Please specify the text to be synthesized when the inference mode is single"
+        exit 1
+    fi
+
+    if [ "$infer_mode" = "single" ]; then
+        echo 'Text: ' ${infer_text}
+        infer_test_list_file=None
+    elif [ "$infer_mode" = "batch" ]; then
+        infer_text=""
+        infer_text_prompt=""
+        infer_audio_prompt=""
+    fi
+
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
+        --config $exp_config \
+        --log_level debug \
+        --acoustics_dir $infer_expt_dir \
+        --output_dir $infer_output_dir \
+        --mode $infer_mode \
+        --text "$infer_text" \
+        --text_prompt "$infer_text_prompt" \
+        --audio_prompt $infer_audio_prompt \
+        --test_list_file $infer_test_list_file \
+
+fi
egs/tts/VITS/README.md
ADDED
@@ -0,0 +1,135 @@
+
+# VITS Recipe
+
+In this recipe, we will show how to train [VITS](https://arxiv.org/abs/2106.06103) using Amphion's infrastructure. VITS is an end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+You can use any of the commonly used TTS datasets to train the model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech for your first training run. How to download the datasets is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "LJSpeech",
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "LJSpeech": "[LJSpeech dataset path]",
+    },
+```
+
+## 2. Features Extraction
+
+### Configuration
+
+Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
+    "log_dir": "ckpts/tts",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`):
+
+```bash
+sh egs/tts/VITS/run.sh --stage 1
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```
+"train": {
+    "batch_size": 16,
+  }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
+
+```bash
+sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
+
+
+## 4. Inference
+
+### Configuration
+
+For inference, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| ---------- | ----------- | ------- |
+| `--infer_expt_dir` | The experiment directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
+| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
+| `--infer_dataset` | The dataset used for inference. | For the LJSpeech dataset, the inference dataset would be `LJSpeech`. |
+| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For the LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at feature extraction, or "`golden_test`" cherry-picked from the test set as a template testing set. |
+| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
+
+### Run
+For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:
+
+```bash
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
+    --infer_mode "batch" \
+    --infer_dataset "LJSpeech" \
+    --infer_testing_set "test"
+```
+
+Or, if you want to generate a single clip of speech from a given text, just run:
+
+```bash
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model."
+```
+
+We released a pre-trained Amphion VITS model trained on LJSpeech, so you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instructions.
+
+```bibtex
+@inproceedings{kim2021conditional,
+  title={Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech},
+  author={Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
+  booktitle={International Conference on Machine Learning},
+  pages={5530--5540},
+  year={2021},
+}
```