zyingt committed on
Commit
0d80816
·
1 Parent(s): 43347e6

Upload 685 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +61 -0
  3. LICENSE +21 -0
  4. app.py +84 -0
  5. bins/calc_metrics.py +140 -0
  6. bins/svc/inference.py +262 -0
  7. bins/svc/preprocess.py +182 -0
  8. bins/svc/train.py +107 -0
  9. bins/tta/inference.py +94 -0
  10. bins/tta/preprocess.py +194 -0
  11. bins/tta/train_tta.py +77 -0
  12. bins/tts/inference.py +162 -0
  13. bins/tts/preprocess.py +253 -0
  14. bins/tts/train.py +107 -0
  15. bins/vocoder/inference.py +114 -0
  16. bins/vocoder/preprocess.py +150 -0
  17. bins/vocoder/train.py +90 -0
  18. config/audioldm.json +92 -0
  19. config/autoencoderkl.json +69 -0
  20. config/base.json +220 -0
  21. config/comosvc.json +216 -0
  22. config/diffusion.json +227 -0
  23. config/fs2.json +117 -0
  24. config/transformer.json +180 -0
  25. config/tts.json +26 -0
  26. config/valle.json +52 -0
  27. config/vits.json +101 -0
  28. config/vocoder.json +84 -0
  29. egs/datasets/README.md +381 -0
  30. egs/metrics/README.md +93 -0
  31. egs/metrics/run.sh +40 -0
  32. egs/svc/DiffComoSVC/README.md +234 -0
  33. egs/svc/DiffComoSVC/exp_config.json +143 -0
  34. egs/svc/DiffComoSVC/run.sh +150 -0
  35. egs/svc/MultipleContentsSVC/README.md +153 -0
  36. egs/svc/MultipleContentsSVC/exp_config.json +126 -0
  37. egs/svc/MultipleContentsSVC/run.sh +150 -0
  38. egs/svc/README.md +34 -0
  39. egs/svc/TransformerSVC/README.md +164 -0
  40. egs/svc/TransformerSVC/exp_config.json +108 -0
  41. egs/svc/TransformerSVC/run.sh +150 -0
  42. egs/svc/_template/run.sh +150 -0
  43. egs/tta/README.md +19 -0
  44. egs/tta/RECIPE.md +156 -0
  45. egs/tta/audioldm/exp_config.json +90 -0
  46. egs/tta/audioldm/exp_config_base.json +11 -0
  47. egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
  48. egs/tta/audioldm/run_inference.sh +52 -0
  49. egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
  50. egs/tta/audioldm/run_train.sh +26 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mac OS files
2
+ .DS_Store
3
+
4
+ # IDEs
5
+ .idea
6
+ .vs
7
+ .vscode
8
+ .cache
9
+
10
+ # GitHub files
11
+ .github
12
+
13
+ # Byte-compiled / optimized / DLL / cached files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.pyc
18
+ .temp
19
+ *.c
20
+ *.so
21
+ *.o
22
+
23
+ # Developing mode
24
+ _*.sh
25
+ _*.json
26
+ *.lst
27
+ yard*
28
+ *.out
29
+ evaluation/evalset_selection
30
+ mfa
31
+ egs/svc/*wavmark
32
+ egs/svc/custom
33
+ egs/svc/*/dev*
34
+ egs/svc/dev_exp_config.json
35
+ bins/svc/demo*
36
+ bins/svc/preprocess_custom.py
37
+ data
38
+ ckpts
39
+
40
+ # Data and ckpt
41
+ *.pkl
42
+ *.pt
43
+ *.npy
44
+ *.npz
45
+ *.tar.gz
46
+ *.ckpt
47
+ *.wav
48
+ *.flac
49
+ pretrained/wenet/*conformer_exp
50
+
51
+ # Runtime data dirs
52
+ processed_data
53
+ data
54
+ model_ckpt
55
+ logs
56
+ *.ipynb
57
+ *.lst
58
+ source_audio
59
+ result
60
+ conversion_results
61
+ get_available_gpu.py
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Amphion
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import subprocess
7
+
8
+ command_to_run = "cd ./modules/monotonic_align;mkdir -p monotonic_align;python setup.py build_ext --inplace;cd /home/user/app"
9
+
10
+ try:
11
+ result = subprocess.check_output(command_to_run, shell=True, text=True)
12
+ print("Command output:")
13
+ print(result)
14
+ except subprocess.CalledProcessError as e:
15
+ print(f"Command failed with return code {e.returncode}")
16
+
17
+ import gradio as gr
18
+ import os
19
+ import inference
20
+
21
+ SUPPORTED_SPEAKERS = {
22
+ "92":"hifitts_92",
23
+ "6097":"hifitts_6097",
24
+ "6670":"hifitts_6670",
25
+ "6671":"hifitts_6671",
26
+ "8051":"hifitts_8051",
27
+ "9017":"hifitts_9017",
28
+ "9136":"hifitts_9136",
29
+ "11614":"hifitts_11614",
30
+ "11697":"hifitts_11697",
31
+ "12787":"hifitts_12787"
32
+ }
33
+
34
+
35
+ def tts_inference(
36
+ input_text,
37
+ target_speaker
38
+ ):
39
+
40
+ args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
41
+ args_list += ["--checkpoint_path", "./latest-checkpoint"]
42
+ args_list += ["--speaker_name", target_speaker]
43
+ args_list += ["--text", input_text]
44
+ args_list += ["--mode","single"]
45
+ args_list += ["--output_dir", "result"]
46
+ args_list += ["--log_level", "debug"]
47
+
48
+ os.environ["WORK_DIR"] = "./"
49
+ inference.main(args_list)
50
+
51
+ ### Display ###
52
+ result_file = os.path.join(
53
+ "result/{}.wav".format(target_speaker)
54
+ )
55
+ return result_file
56
+
57
+
58
+ demo_inputs = [
59
+ gr.Textbox(
60
+ label="Input text",
61
+ type="text",
62
+ lines=1,
63
+ max_lines=20
64
+ ),
65
+ gr.Radio(
66
+ choices=list(SUPPORTED_SPEAKERS.keys()),
67
+ label="Target Speaker",
68
+ value="92"
69
+ )
70
+ ]
71
+
72
+ demo_output = gr.Audio(label="")
73
+
74
+
75
+
76
+ demo = gr.Interface(
77
+ fn=tts_inference,
78
+ inputs=demo_inputs,
79
+ outputs=demo_output,
80
+ title="Amphion HifiTTS Text-to-Speech Demo",
81
+ )
82
+
83
+ if __name__ == "__main__":
84
+ demo.launch(share=True)
bins/calc_metrics.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import numpy as np
8
+ import json
9
+ import argparse
10
+
11
+ from glob import glob
12
+ from tqdm import tqdm
13
+ from collections import defaultdict
14
+
15
+ from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
16
+ from evaluation.metrics.energy.energy_pearson_coefficients import (
17
+ extract_energy_pearson_coeffcients,
18
+ )
19
+ from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
20
+ from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
21
+ from evaluation.metrics.f0.f0_rmse import extract_f0rmse
22
+ from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
23
+ from evaluation.metrics.intelligibility.character_error_rate import extract_cer
24
+ from evaluation.metrics.intelligibility.word_error_rate import extract_wer
25
+ from evaluation.metrics.similarity.speaker_similarity import extract_speaker_similarity
26
+ from evaluation.metrics.spectrogram.frechet_distance import extract_fad
27
+ from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
28
+ from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
29
+ from evaluation.metrics.spectrogram.pesq import extract_pesq
30
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
31
+ extract_si_sdr,
32
+ )
33
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
34
+ extract_si_snr,
35
+ )
36
+ from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
37
+ extract_stoi,
38
+ )
39
+
40
+ METRIC_FUNC = {
41
+ "energy_rmse": extract_energy_rmse,
42
+ "energy_pc": extract_energy_pearson_coeffcients,
43
+ "fpc": extract_fpc,
44
+ "f0_periodicity_rmse": extract_f0_periodicity_rmse,
45
+ "f0rmse": extract_f0rmse,
46
+ "v_uv_f1": extract_f1_v_uv,
47
+ "cer": extract_cer,
48
+ "wer": extract_wer,
49
+ "speaker_similarity": extract_speaker_similarity,
50
+ "fad": extract_fad,
51
+ "mcd": extract_mcd,
52
+ "mstft": extract_mstft,
53
+ "pesq": extract_pesq,
54
+ "si_sdr": extract_si_sdr,
55
+ "si_snr": extract_si_snr,
56
+ "stoi": extract_stoi,
57
+ }
58
+
59
+
60
+ def calc_metric(ref_dir, deg_dir, dump_dir, metrics, fs=None):
61
+ result = defaultdict()
62
+
63
+ for metric in tqdm(metrics):
64
+ if metric in ["fad", "speaker_similarity"]:
65
+ result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir))
66
+ continue
67
+
68
+ audios_ref = []
69
+ audios_deg = []
70
+
71
+ files = glob(ref_dir + "/*.wav")
72
+
73
+ for file in files:
74
+ audios_ref.append(file)
75
+ uid = file.split("/")[-1].split(".wav")[0]
76
+ file_gt = deg_dir + "/{}.wav".format(uid)
77
+ audios_deg.append(file_gt)
78
+
79
+ if metric in ["v_uv_f1"]:
80
+ tp_total = 0
81
+ fp_total = 0
82
+ fn_total = 0
83
+
84
+ for i in tqdm(range(len(audios_ref))):
85
+ audio_ref = audios_ref[i]
86
+ audio_deg = audios_deg[i]
87
+ tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, fs)
88
+ tp_total += tp
89
+ fp_total += fp
90
+ fn_total += fn
91
+
92
+ result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2))
93
+ else:
94
+ scores = []
95
+
96
+ for i in tqdm(range(len(audios_ref))):
97
+ audio_ref = audios_ref[i]
98
+ audio_deg = audios_deg[i]
99
+
100
+ score = METRIC_FUNC[metric](
101
+ audio_ref=audio_ref, audio_deg=audio_deg, fs=fs
102
+ )
103
+ if not np.isnan(score):
104
+ scores.append(score)
105
+
106
+ scores = np.array(scores)
107
+ result["{}_mean".format(metric)] = str(np.mean(scores))
108
+ result["{}_std".format(metric)] = str(np.std(scores))
109
+
110
+ data = json.dumps(result, indent=4)
111
+
112
+ with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
113
+ f.write(data)
114
+
115
+
116
+ if __name__ == "__main__":
117
+ parser = argparse.ArgumentParser()
118
+ parser.add_argument(
119
+ "--ref_dir",
120
+ type=str,
121
+ help="Path to the target audio folder.",
122
+ )
123
+ parser.add_argument(
124
+ "--deg_dir",
125
+ type=str,
126
+ help="Path to the reference audio folder.",
127
+ )
128
+ parser.add_argument(
129
+ "--dump_dir",
130
+ type=str,
131
+ help="Path to dump the results.",
132
+ )
133
+ parser.add_argument(
134
+ "--metrics",
135
+ nargs="+",
136
+ help="Metrics used to evaluate.",
137
+ )
138
+ args = parser.parse_args()
139
+
140
+ calc_metric(args.ref_dir, args.deg_dir, args.dump_dir, args.metrics)
bins/svc/inference.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import glob
9
+ from tqdm import tqdm
10
+ import json
11
+ import torch
12
+ import time
13
+
14
+ from models.svc.diffusion.diffusion_inference import DiffusionInference
15
+ from models.svc.comosvc.comosvc_inference import ComoSVCInference
16
+ from models.svc.transformer.transformer_inference import TransformerInference
17
+ from utils.util import load_config
18
+ from utils.audio_slicer import split_audio, merge_segments_encodec
19
+ from processors import acoustic_extractor, content_extractor
20
+
21
+
22
+ def build_inference(args, cfg, infer_type="from_dataset"):
23
+ supported_inference = {
24
+ "DiffWaveNetSVC": DiffusionInference,
25
+ "DiffComoSVC": ComoSVCInference,
26
+ "TransformerSVC": TransformerInference,
27
+ }
28
+
29
+ inference_class = supported_inference[cfg.model_type]
30
+ return inference_class(args, cfg, infer_type)
31
+
32
+
33
+ def prepare_for_audio_file(args, cfg, num_workers=1):
34
+ preprocess_path = cfg.preprocess.processed_dir
35
+ audio_name = cfg.inference.source_audio_name
36
+ temp_audio_dir = os.path.join(preprocess_path, audio_name)
37
+
38
+ ### eval file
39
+ t = time.time()
40
+ eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
41
+ args.source = eval_file
42
+ with open(eval_file, "r") as f:
43
+ metadata = json.load(f)
44
+ print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
45
+
46
+ ### acoustic features
47
+ t = time.time()
48
+ acoustic_extractor.extract_utt_acoustic_features_serial(
49
+ metadata, temp_audio_dir, cfg
50
+ )
51
+ acoustic_extractor.cal_mel_min_max(
52
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
53
+ )
54
+ acoustic_extractor.cal_pitch_statistics_svc(
55
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
56
+ )
57
+ print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
58
+
59
+ ### content features
60
+ t = time.time()
61
+ content_extractor.extract_utt_content_features_dataloader(
62
+ cfg, metadata, num_workers
63
+ )
64
+ print("Prepare for content features: {:.1f}s".format(time.time() - t))
65
+ return args, cfg, temp_audio_dir
66
+
67
+
68
+ def merge_for_audio_segments(audio_files, args, cfg):
69
+ audio_name = cfg.inference.source_audio_name
70
+ target_singer_name = args.target_singer
71
+
72
+ merge_segments_encodec(
73
+ wav_files=audio_files,
74
+ fs=cfg.preprocess.sample_rate,
75
+ output_path=os.path.join(
76
+ args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
77
+ ),
78
+ overlap_duration=cfg.inference.segments_overlap_duration,
79
+ )
80
+
81
+ for tmp_file in audio_files:
82
+ os.remove(tmp_file)
83
+
84
+
85
+ def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
86
+ """
87
+ Prepare the eval file (json) for a source audio file
88
+ """
89
+
90
+ audio_chunks_results = split_audio(
91
+ wav_file=cfg.inference.source_audio_path,
92
+ target_sr=cfg.preprocess.sample_rate,
93
+ output_dir=os.path.join(temp_audio_dir, "wavs"),
94
+ max_duration_of_segment=cfg.inference.segments_max_duration,
95
+ overlap_duration=cfg.inference.segments_overlap_duration,
96
+ )
97
+
98
+ metadata = []
99
+ for i, res in enumerate(audio_chunks_results):
100
+ res["index"] = i
101
+ res["Dataset"] = audio_name
102
+ res["Singer"] = audio_name
103
+ res["Uid"] = "{}_{}".format(audio_name, res["Uid"])
104
+ metadata.append(res)
105
+
106
+ eval_file = os.path.join(temp_audio_dir, "eval.json")
107
+ with open(eval_file, "w") as f:
108
+ json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)
109
+
110
+ return eval_file
111
+
112
+
113
+ def cuda_relevant(deterministic=False):
114
+ torch.cuda.empty_cache()
115
+ # TF32 on Ampere and above
116
+ torch.backends.cuda.matmul.allow_tf32 = True
117
+ torch.backends.cudnn.enabled = True
118
+ torch.backends.cudnn.allow_tf32 = True
119
+ # Deterministic
120
+ torch.backends.cudnn.deterministic = deterministic
121
+ torch.backends.cudnn.benchmark = not deterministic
122
+ torch.use_deterministic_algorithms(deterministic)
123
+
124
+
125
+ def infer(args, cfg, infer_type):
126
+ # Build inference
127
+ t = time.time()
128
+ trainer = build_inference(args, cfg, infer_type)
129
+ print("Model Init: {:.1f}s".format(time.time() - t))
130
+
131
+ # Run inference
132
+ t = time.time()
133
+ output_audio_files = trainer.inference()
134
+ print("Model inference: {:.1f}s".format(time.time() - t))
135
+ return output_audio_files
136
+
137
+
138
+ def build_parser():
139
+ r"""Build argument parser for inference.py.
140
+ Anything else should be put in an extra config YAML file.
141
+ """
142
+
143
+ parser = argparse.ArgumentParser()
144
+ parser.add_argument(
145
+ "--config",
146
+ type=str,
147
+ required=True,
148
+ help="JSON/YAML file for configurations.",
149
+ )
150
+ parser.add_argument(
151
+ "--acoustics_dir",
152
+ type=str,
153
+ help="Acoustics model checkpoint directory. If a directory is given, "
154
+ "search for the latest checkpoint dir in the directory. If a specific "
155
+ "checkpoint dir is given, directly load the checkpoint.",
156
+ )
157
+ parser.add_argument(
158
+ "--vocoder_dir",
159
+ type=str,
160
+ required=True,
161
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
162
+ "the acoustics one.",
163
+ )
164
+ parser.add_argument(
165
+ "--target_singer",
166
+ type=str,
167
+ required=True,
168
+ help="convert to a specific singer (e.g. --target_singers singer_id).",
169
+ )
170
+ parser.add_argument(
171
+ "--trans_key",
172
+ default=0,
173
+ help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
174
+ )
175
+ parser.add_argument(
176
+ "--source",
177
+ type=str,
178
+ default="source_audio",
179
+ help="Source audio file or directory. If a JSON file is given, "
180
+ "inference from dataset is applied. If a directory is given, "
181
+ "inference from all wav/flac/mp3 audio files in the directory is applied. "
182
+ "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
183
+ )
184
+ parser.add_argument(
185
+ "--output_dir",
186
+ type=str,
187
+ default="conversion_results",
188
+ help="Output directory. Default: ./conversion_results",
189
+ )
190
+ parser.add_argument(
191
+ "--log_level",
192
+ type=str,
193
+ default="warning",
194
+ help="Logging level. Default: warning",
195
+ )
196
+ parser.add_argument(
197
+ "--keep_cache",
198
+ action="store_true",
199
+ default=True,
200
+ help="Keep cache files. Only applicable to inference from files.",
201
+ )
202
+ parser.add_argument(
203
+ "--diffusion_inference_steps",
204
+ type=int,
205
+ default=50,
206
+ help="Number of inference steps. Only applicable to diffusion inference.",
207
+ )
208
+ return parser
209
+
210
+
211
+ def main():
212
+ ### Parse arguments and config
213
+ args = build_parser().parse_args()
214
+ cfg = load_config(args.config)
215
+
216
+ # CUDA settings
217
+ cuda_relevant()
218
+
219
+ if os.path.isdir(args.source):
220
+ ### Infer from file
221
+
222
+ # Get all the source audio files (.wav, .flac, .mp3)
223
+ source_audio_dir = args.source
224
+ audio_list = []
225
+ for suffix in ["wav", "flac", "mp3"]:
226
+ audio_list += glob.glob(
227
+ os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
228
+ )
229
+ print("There are {} source audios: ".format(len(audio_list)))
230
+
231
+ # Infer for every file as dataset
232
+ output_root_path = args.output_dir
233
+ for audio_path in tqdm(audio_list):
234
+ audio_name = audio_path.split("/")[-1].split(".")[0]
235
+ args.output_dir = os.path.join(output_root_path, audio_name)
236
+ print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
237
+
238
+ cfg.inference.source_audio_path = audio_path
239
+ cfg.inference.source_audio_name = audio_name
240
+ cfg.inference.segments_max_duration = 10.0
241
+ cfg.inference.segments_overlap_duration = 1.0
242
+
243
+ # Prepare metadata and features
244
+ args, cfg, cache_dir = prepare_for_audio_file(args, cfg)
245
+
246
+ # Infer from file
247
+ output_audio_files = infer(args, cfg, infer_type="from_file")
248
+
249
+ # Merge the split segments
250
+ merge_for_audio_segments(output_audio_files, args, cfg)
251
+
252
+ # Keep or remove caches
253
+ if not args.keep_cache:
254
+ os.removedirs(cache_dir)
255
+
256
+ else:
257
+ ### Infer from dataset
258
+ infer(args, cfg, infer_type="from_dataset")
259
+
260
+
261
+ if __name__ == "__main__":
262
+ main()
bins/svc/preprocess.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ from multiprocessing import cpu_count
14
+
15
+
16
+ from utils.util import load_config
17
+ from preprocessors.processor import preprocess_dataset
18
+ from preprocessors.metadata import cal_metadata
19
+ from processors import acoustic_extractor, content_extractor, data_augment
20
+
21
+
22
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
23
+ """Extract acoustic features of utterances in the dataset
24
+
25
+ Args:
26
+ dataset (str): name of dataset, e.g. opencpop
27
+ output_path (str): directory that stores train, test and feature files of datasets
28
+ cfg (dict): dictionary that stores configurations
29
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
30
+ """
31
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
32
+ metadata = []
33
+ dataset_output = os.path.join(output_path, dataset)
34
+
35
+ for dataset_type in types:
36
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
37
+ with open(dataset_file, "r") as f:
38
+ metadata.extend(json.load(f))
39
+
40
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
41
+ # metadata, dataset_output, cfg, n_workers=n_workers
42
+ # )
43
+ acoustic_extractor.extract_utt_acoustic_features_serial(
44
+ metadata, dataset_output, cfg
45
+ )
46
+
47
+
48
+ def extract_content_features(dataset, output_path, cfg, num_workers=1):
49
+ """Extract content features of utterances in the dataset
50
+
51
+ Args:
52
+ dataset (str): name of dataset, e.g. opencpop
53
+ output_path (str): directory that stores train, test and feature files of datasets
54
+ cfg (dict): dictionary that stores configurations
55
+ """
56
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
57
+ metadata = []
58
+ for dataset_type in types:
59
+ dataset_output = os.path.join(output_path, dataset)
60
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
61
+ with open(dataset_file, "r") as f:
62
+ metadata.extend(json.load(f))
63
+
64
+ content_extractor.extract_utt_content_features_dataloader(
65
+ cfg, metadata, num_workers
66
+ )
67
+
68
+
69
+ def preprocess(cfg, args):
70
+ """Preprocess raw data of single or multiple datasets (in cfg.dataset)
71
+
72
+ Args:
73
+ cfg (dict): dictionary that stores configurations
74
+ args (argparse.Namespace): parsed command-line arguments that specify the configuration file and num_workers
75
+ """
76
+ # Specify the output root path to save the processed data
77
+ output_path = cfg.preprocess.processed_dir
78
+ os.makedirs(output_path, exist_ok=True)
79
+
80
+ ## Split train and test sets
81
+ for dataset in cfg.dataset:
82
+ print("Preprocess {}...".format(dataset))
83
+ preprocess_dataset(
84
+ dataset,
85
+ cfg.dataset_path[dataset],
86
+ output_path,
87
+ cfg.preprocess,
88
+ is_custom_dataset=cfg.use_custom_dataset,
89
+ )
90
+
91
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
92
+ try:
93
+ assert isinstance(
94
+ cfg.preprocess.data_augment, list
95
+ ), "Please provide a list of datasets need to be augmented."
96
+ if len(cfg.preprocess.data_augment) > 0:
97
+ new_datasets_list = []
98
+ for dataset in cfg.preprocess.data_augment:
99
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
100
+ new_datasets_list.extend(new_datasets)
101
+ cfg.dataset.extend(new_datasets_list)
102
+ print("Augmentation datasets: ", cfg.dataset)
103
+ except:
104
+ print("No Data Augmentation.")
105
+
106
+ # Dump metadata of datasets (singers, train/test durations, etc.)
107
+ cal_metadata(cfg)
108
+
109
+ ## Prepare the acoustic features
110
+ for dataset in cfg.dataset:
111
+ # Skip augmented datasets which do not need to extract acoustic features
112
+ # We will copy acoustic features from the original dataset later
113
+ if (
114
+ "pitch_shift" in dataset
115
+ or "formant_shift" in dataset
116
+ or "equalizer" in dataset in dataset
117
+ ):
118
+ continue
119
+ print(
120
+ "Extracting acoustic features for {} using {} workers ...".format(
121
+ dataset, args.num_workers
122
+ )
123
+ )
124
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
125
+ # Calculate the statistics of acoustic features
126
+ if cfg.preprocess.mel_min_max_norm:
127
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
128
+
129
+ if cfg.preprocess.extract_pitch:
130
+ acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)
131
+
132
+ # Copy acoustic features for augmented datasets by creating soft-links
133
+ for dataset in cfg.dataset:
134
+ if "pitch_shift" in dataset:
135
+ src_dataset = dataset.replace("_pitch_shift", "")
136
+ src_dataset_dir = os.path.join(output_path, src_dataset)
137
+ elif "formant_shift" in dataset:
138
+ src_dataset = dataset.replace("_formant_shift", "")
139
+ src_dataset_dir = os.path.join(output_path, src_dataset)
140
+ elif "equalizer" in dataset:
141
+ src_dataset = dataset.replace("_equalizer", "")
142
+ src_dataset_dir = os.path.join(output_path, src_dataset)
143
+ else:
144
+ continue
145
+ dataset_dir = os.path.join(output_path, dataset)
146
+ metadata = []
147
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
148
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
149
+ with open(metadata_file_path, "r") as f:
150
+ metadata.extend(json.load(f))
151
+ print("Copying acoustic features for {}...".format(dataset))
152
+ acoustic_extractor.copy_acoustic_features(
153
+ metadata, dataset_dir, src_dataset_dir, cfg
154
+ )
155
+ if cfg.preprocess.mel_min_max_norm:
156
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
157
+
158
+ if cfg.preprocess.extract_pitch:
159
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
160
+
161
+ # Prepare the content features
162
+ for dataset in cfg.dataset:
163
+ print("Extracting content features for {}...".format(dataset))
164
+ extract_content_features(dataset, output_path, cfg, args.num_workers)
165
+
166
+
167
+ def main():
168
+ parser = argparse.ArgumentParser()
169
+ parser.add_argument(
170
+ "--config", default="config.json", help="json files for configurations."
171
+ )
172
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
173
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
174
+
175
+ args = parser.parse_args()
176
+ cfg = load_config(args.config)
177
+
178
+ preprocess(cfg, args)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
bins/svc/train.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
11
+ from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
12
+ from models.svc.transformer.transformer_trainer import TransformerTrainer
13
+ from utils.util import load_config
14
+
15
+
16
+ def build_trainer(args, cfg):
17
+ supported_trainer = {
18
+ "DiffWaveNetSVC": DiffusionTrainer,
19
+ "DiffComoSVC": ComoSVCTrainer,
20
+ "TransformerSVC": TransformerTrainer,
21
+ }
22
+
23
+ trainer_class = supported_trainer[cfg.model_type]
24
+ trainer = trainer_class(args, cfg)
25
+ return trainer
26
+
27
+
28
+ def cuda_relevant(deterministic=False):
29
+ torch.cuda.empty_cache()
30
+ # TF32 on Ampere and above
31
+ torch.backends.cuda.matmul.allow_tf32 = True
32
+ torch.backends.cudnn.enabled = True
33
+ torch.backends.cudnn.allow_tf32 = True
34
+ # Deterministic
35
+ torch.backends.cudnn.deterministic = deterministic
36
+ torch.backends.cudnn.benchmark = not deterministic
37
+ torch.use_deterministic_algorithms(deterministic)
38
+
39
+
40
+ def main():
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument(
43
+ "--config",
44
+ default="config.json",
45
+ help="json files for configurations.",
46
+ required=True,
47
+ )
48
+ parser.add_argument(
49
+ "--exp_name",
50
+ type=str,
51
+ default="exp_name",
52
+ help="A specific name to note the experiment",
53
+ required=True,
54
+ )
55
+ parser.add_argument(
56
+ "--resume",
57
+ action="store_true",
58
+ help="If specified, to resume from the existing checkpoint.",
59
+ )
60
+ parser.add_argument(
61
+ "--resume_from_ckpt_path",
62
+ type=str,
63
+ default="",
64
+ help="The specific checkpoint path that you want to resume from.",
65
+ )
66
+ parser.add_argument(
67
+ "--resume_type",
68
+ type=str,
69
+ default="",
70
+ help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
71
+ )
72
+
73
+ parser.add_argument(
74
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
75
+ )
76
+ args = parser.parse_args()
77
+ cfg = load_config(args.config)
78
+
79
+ # Data Augmentation
80
+ if (
81
+ type(cfg.preprocess.data_augment) == list
82
+ and len(cfg.preprocess.data_augment) > 0
83
+ ):
84
+ new_datasets_list = []
85
+ for dataset in cfg.preprocess.data_augment:
86
+ new_datasets = [
87
+ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
88
+ f"{dataset}_formant_shift"
89
+ if cfg.preprocess.use_formant_shift
90
+ else None,
91
+ f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
92
+ f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
93
+ ]
94
+ new_datasets_list.extend(filter(None, new_datasets))
95
+ cfg.dataset.extend(new_datasets_list)
96
+
97
+ # CUDA settings
98
+ cuda_relevant()
99
+
100
+ # Build trainer
101
+ trainer = build_trainer(args, cfg)
102
+
103
+ trainer.train_loop()
104
+
105
+
106
+ if __name__ == "__main__":
107
+ main()
bins/tta/inference.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tta.ldm.audioldm_inference import AudioLDMInference
11
+ from utils.util import save_config, load_model_config, load_config
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
def build_inference(args, cfg):
    """Instantiate the inference class registered for ``cfg.model_type``.

    Args:
        args: parsed command-line namespace, forwarded to the constructor.
        cfg: loaded configuration; ``cfg.model_type`` selects the class.

    Returns:
        The constructed inference object.

    Raises:
        KeyError: if ``cfg.model_type`` is not a registered model type.
    """
    registry = {"AudioLDM": AudioLDMInference}
    return registry[cfg.model_type](args, cfg)
24
+
25
+
26
def build_parser():
    """Create the argument parser for the text-to-audio inference script.

    Returns:
        argparse.ArgumentParser: parser with every option registered; only
        ``--config`` is required.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_table = (
        (
            "--config",
            {"type": str, "required": True, "help": "JSON/YAML file for configurations."},
        ),
        (
            "--text",
            {
                "help": "Text to be synthesized",
                "type": str,
                "default": "Text to be synthesized.",
            },
        ),
        ("--checkpoint_path", {"type": str}),
        ("--vocoder_path", {"type": str, "help": "Checkpoint path of the vocoder"}),
        (
            "--vocoder_config_path",
            {"type": str, "help": "Config path of the vocoder"},
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": None,
                "help": "Output dir for saving generated results",
            },
        ),
        (
            "--num_steps",
            {"type": int, "default": 200, "help": "The total number of denosing steps"},
        ),
        (
            "--guidance_scale",
            {
                "type": float,
                "default": 4.0,
                "help": "The scale of classifer free guidance",
            },
        ),
        ("--local_rank", {"default": -1, "type": int}),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
71
+
72
+
73
def main():
    """Entry point: parse arguments, load the config, and run inference.

    ``args.local_rank`` is overwritten with a ``torch.device`` ("cuda" when
    CUDA is available, otherwise "cpu") before the inference object is built.
    """
    args = build_parser().parse_args()
    cfg = load_config(args.config)

    # Select the device and stash it on the namespace handed to the inferencer.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    args.local_rank = torch.device(device_name)
    print("args: ", args)

    # Build the inference object, then run it.
    build_inference(args, cfg).inference()


if __name__ == "__main__":
    main()
bins/tta/preprocess.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features for every utterance of one dataset.

    The metadata of the "train" and "test" splits (only "test" when the
    dataset name contains "eval") is read from ``<output_path>/<dataset>/``
    and handed to the serial acoustic extractor.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory holding the per-dataset split JSON files
        cfg: loaded configuration, forwarded to the extractor
        n_workers (int, optional): accepted for interface compatibility; it is
            currently unused because the serial extractor is always called.
    """
    split_names = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split_name in split_names:
        split_path = os.path.join(dataset_output, split_name + ".json")
        with open(split_path, "r") as fp:
            metadata += json.load(fp)

    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
46
+
47
+
48
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features for every utterance of one dataset.

    The metadata of the "train" and "test" splits (only "test" when the
    dataset name contains "eval") is read from ``<output_path>/<dataset>/``
    and handed to the dataloader-based content extractor.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory holding the per-dataset split JSON files
        cfg: loaded configuration, forwarded to the extractor
        num_workers (int, optional): forwarded to the extractor. Defaults to 1.
    """
    split_names = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split_name in split_names:
        split_path = os.path.join(dataset_output, split_name + ".json")
        with open(split_path, "r") as fp:
            metadata += json.load(fp)

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
67
+
68
+
69
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset).

    Stages, in order:
      1. per dataset: optional alignment preparation, then ``preprocess_dataset``;
      2. best-effort data augmentation, which appends the new dataset names
         to ``cfg.dataset``;
      3. metadata dump (``cal_metadata``);
      4. acoustic feature extraction and statistics for every dataset whose
         name does not contain a pitch-shift / formant-shift / equalizer tag;
      5. for the tagged datasets, acoustic features are copied from the
         source dataset instead of being extracted;
      6. content feature extraction for every dataset.

    Args:
        cfg: loaded configuration
        args: parsed command line; ``num_workers`` and ``prepare_alignment``
            are read here
    """
    # Datasets carrying one of these tags reuse the acoustic features of the
    # dataset they were derived from (checked in this order).
    copied_tags = ("pitch_shift", "formant_shift", "equalizer")

    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            ## Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            is_custom_dataset=cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    # This stage stays best-effort (a missing or invalid `data_augment` entry
    # simply disables it), but only ordinary exceptions are absorbed now, so
    # Ctrl-C / SystemExit propagate, and the reason is printed instead of
    # being silently discarded.
    try:
        augment_sources = cfg.preprocess.data_augment
        # Explicit check rather than `assert`, which is stripped under -O.
        if not isinstance(augment_sources, list):
            raise TypeError("Please provide a list of datasets need to be augmented.")
        if augment_sources:
            new_datasets_list = []
            for dataset in augment_sources:
                new_datasets_list.extend(data_augment.augment_dataset(cfg, dataset))
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception as err:
        print("No Data Augmentation.")
        print("  reason: {!r}".format(err))

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if any(tag in dataset for tag in copied_tags):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)

        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.align_mel_duration:
            acoustic_extractor.align_duration_mel(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets (the original comment says
    # this is done by creating soft-links; that happens inside the extractor).
    for dataset in cfg.dataset:
        matched_tag = next((tag for tag in copied_tags if tag in dataset), None)
        if matched_tag is None:
            continue
        src_dataset = dataset.replace("_" + matched_tag, "")
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)

        # The metadata is read from the *source* dataset's split files.
        splits = ["test"] if "eval" in dataset else ["train", "test"]
        metadata = []
        for split in splits:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))

        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
177
+
178
+
179
def main():
    """Parse the command line, load the config, and run ``preprocess``.

    Flags: ``--config`` (default "config.json"), ``--num_workers`` (default:
    CPU count) and ``--prepare_alignment`` (boolean, default False).
    """

    def _str2bool(value):
        """Convert a command-line token to bool.

        ``type=bool`` treats every non-empty string as True, so
        ``--prepare_alignment False`` used to enable the option.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string was the only falsy token under ``type=bool``.
        if token in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value)
        )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # nargs="?" with const=True also accepts the bare flag form.
    parser.add_argument(
        "--prepare_alignment",
        type=_str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Whether to prepare alignments before preprocessing (true/false).",
    )

    args = parser.parse_args()
    cfg = load_config(args.config)

    preprocess(cfg, args)


if __name__ == "__main__":
    main()
bins/tta/train_tta.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import torch
9
+
10
+ from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
11
+ from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
12
+ from utils.util import load_config
13
+
14
+
15
def build_trainer(args, cfg):
    """Instantiate the trainer class registered for ``cfg.model_type``.

    Args:
        args: parsed command-line namespace, forwarded to the constructor.
        cfg: loaded configuration; ``cfg.model_type`` selects the class.

    Returns:
        The constructed trainer.

    Raises:
        KeyError: if ``cfg.model_type`` is not a registered model type.
    """
    registry = {
        "AutoencoderKL": AutoencoderKLTrainer,
        "AudioLDM": AudioLDMTrainer,
    }
    return registry[cfg.model_type](args, cfg)
24
+
25
+
26
def main():
    """Parse the command line, prepare the log directory, and train.

    After loading the config this sets ``cfg.exp_name``, creates
    ``<cfg.log_dir>/<exp_name>`` (stored as ``args.log_dir``), replaces
    ``args.local_rank`` with ``torch.device("cuda")`` when ``cfg.train.ddp``
    is falsy, builds the trainer, optionally restores it, and trains.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_table = (
        (
            "--config",
            {
                "default": "config.json",
                "help": "json files for configurations.",
                "required": True,
            },
        ),
        (
            "--num_workers",
            {"type": int, "default": 6, "help": "Number of dataloader workers."},
        ),
        (
            "--exp_name",
            {
                "type": str,
                "default": "exp_name",
                "help": "A specific name to note the experiment",
                "required": True,
            },
        ),
        (
            "--resume",
            {"type": str, "default": None, "help": "The model name to restore"},
        ),
        (
            "--log_level",
            {"default": "info", "help": "logging level (info, debug, warning)"},
        ),
        ("--stdout_interval", {"default": 5, "type": int}),
        ("--local_rank", {"default": -1, "type": int}),
    )
    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    cfg = load_config(args.config)
    cfg.exp_name = args.exp_name

    # Model saving dir
    log_dir = os.path.join(cfg.log_dir, args.exp_name)
    args.log_dir = log_dir
    os.makedirs(log_dir, exist_ok=True)

    # Outside DDP the namespace carries the target device in `local_rank`.
    if not cfg.train.ddp:
        args.local_rank = torch.device("cuda")

    trainer = build_trainer(args, cfg)

    # Restore models when a name was given with --resume.
    if args.resume:
        trainer.restore()
    trainer.train()


if __name__ == "__main__":
    main()
bins/tts/inference.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
11
+ from models.tts.vits.vits_inference import VitsInference
12
+ from models.tts.valle.valle_inference import VALLEInference
13
+ from utils.util import load_config
14
+ import torch
15
+
16
+
17
def build_inference(args, cfg):
    """Instantiate the TTS inference class registered for ``cfg.model_type``.

    Args:
        args: parsed command-line namespace, forwarded to the constructor.
        cfg: loaded configuration; ``cfg.model_type`` selects the class.

    Returns:
        The constructed inference object.

    Raises:
        KeyError: if ``cfg.model_type`` is not a registered model type.
    """
    registry = {
        "FastSpeech2": FastSpeech2Inference,
        "VITS": VitsInference,
        "VALLE": VALLEInference,
    }
    return registry[cfg.model_type](args, cfg)
27
+
28
+
29
def cuda_relevant(deterministic=False):
    """Apply the process-wide torch backend settings used by this script.

    Empties the CUDA cache, turns on the TF32 switches and cuDNN, then sets
    the determinism-related flags from ``deterministic``.

    Args:
        deterministic (bool): when True, request deterministic algorithms and
            disable cuDNN benchmarking; when False, the opposite.
    """
    torch.cuda.empty_cache()

    cudnn = torch.backends.cudnn

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
39
+
40
+
41
def build_parser():
    """Create the argument parser for the TTS inference script.

    Returns:
        argparse.ArgumentParser: parser with every option registered;
        ``--config`` and ``--mode`` are required.
    """
    # --acoustics_dir and --checkpoint_path share the same help text.
    acoustic_ckpt_help = (
        "Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint."
    )

    # (flag, add_argument keyword arguments), in registration order.
    option_table = (
        (
            "--config",
            {"type": str, "required": True, "help": "JSON/YAML file for configurations."},
        ),
        (
            "--dataset",
            {"type": str, "help": "convert from the source data", "default": None},
        ),
        (
            "--testing_set",
            {"type": str, "help": "train, test, golden_test", "default": "test"},
        ),
        (
            "--test_list_file",
            {"type": str, "help": "convert from the test list file", "default": None},
        ),
        (
            "--speaker_name",
            {
                "type": str,
                "default": None,
                "help": "speaker name for multi-speaker synthesis, for single-sentence mode only",
            },
        ),
        (
            "--text",
            {"help": "Text to be synthesized.", "type": str, "default": ""},
        ),
        (
            "--vocoder_dir",
            {
                "type": str,
                "default": None,
                "help": "Vocoder checkpoint directory. Searching behavior is the same as "
                "the acoustics one.",
            },
        ),
        (
            "--acoustics_dir",
            {"type": str, "default": None, "help": acoustic_ckpt_help},
        ),
        (
            "--checkpoint_path",
            {"type": str, "default": None, "help": acoustic_ckpt_help},
        ),
        (
            "--mode",
            {
                "type": str,
                "choices": ["batch", "single"],
                "required": True,
                "help": "Synthesize a whole dataset or a single sentence",
            },
        ),
        (
            "--log_level",
            {
                "type": str,
                "default": "warning",
                "help": "Logging level. Default: warning",
            },
        ),
        (
            "--pitch_control",
            {
                "type": float,
                "default": 1.0,
                "help": "control the pitch of the whole utterance, larger value for higher pitch",
            },
        ),
        (
            "--energy_control",
            {
                "type": float,
                "default": 1.0,
                "help": "control the energy of the whole utterance, larger value for larger volume",
            },
        ),
        (
            "--duration_control",
            {
                "type": float,
                "default": 1.0,
                "help": "control the speed of the whole utterance, larger value for slower speaking rate",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": None,
                "help": "Output dir for saving generated results",
            },
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
141
+
142
def main():
    """Entry point: parse arguments, load the config, and run TTS inference.

    ``VALLEInference.add_arguments`` is given the parser before parsing so it
    can register its own options.
    """
    # Parse arguments
    arg_parser = build_parser()
    VALLEInference.add_arguments(arg_parser)
    args = arg_parser.parse_args()

    # Parse config
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build the inference object, then run it.
    build_inference(args, cfg).inference()


if __name__ == "__main__":
    main()
bins/tts/preprocess.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+ faulthandler.enable()
8
+
9
+ import os
10
+ import argparse
11
+ import json
12
+ import pyworld as pw
13
+ from multiprocessing import cpu_count
14
+
15
+
16
+ from utils.util import load_config
17
+ from preprocessors.processor import preprocess_dataset, prepare_align
18
+ from preprocessors.metadata import cal_metadata
19
+ from processors import acoustic_extractor, content_extractor, data_augment, phone_extractor
20
+
21
+
22
def extract_acoustic_features(dataset, output_path, cfg, num_workers=1):
    """Extract acoustic features for every utterance of one dataset.

    Split names are the stems (text before the first ".") of
    ``cfg.preprocess.train_file`` and ``cfg.preprocess.valid_file``, plus
    "test" when not already present; a dataset whose name contains "eval"
    uses only "test". Each ``<split>.json`` under ``<output_path>/<dataset>/``
    is loaded and the combined metadata is passed to the extractor.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory holding the per-dataset split JSON files
        cfg: loaded configuration
        num_workers (int, optional): the parallel extractor is used when this
            is greater than 1, otherwise the serial one. Defaults to 1.
    """
    split_names = [
        cfg.preprocess.train_file.split(".")[0],
        cfg.preprocess.valid_file.split(".")[0],
    ]
    if "test" not in split_names:
        split_names.append("test")
    if "eval" in dataset:
        split_names = ["test"]
    print('types: ', split_names)

    dataset_output = os.path.join(output_path, dataset)
    metadata = []
    for split_name in split_names:
        split_path = os.path.join(dataset_output, split_name + ".json")
        with open(split_path, "r") as fp:
            metadata += json.load(fp)

    if num_workers <= 1:
        acoustic_extractor.extract_utt_acoustic_features_serial(
            metadata, dataset_output, cfg
        )
        return
    acoustic_extractor.extract_utt_acoustic_features_parallel(
        metadata, dataset_output, cfg, num_workers=num_workers
    )
55
+
56
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features for every utterance of one dataset.

    Split names are the stems (text before the first ".") of
    ``cfg.preprocess.train_file`` and ``cfg.preprocess.valid_file``, plus
    "test" when not already present; a dataset whose name contains "eval"
    uses only "test".

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory holding the per-dataset split JSON files
        cfg: loaded configuration
        num_workers (int, optional): forwarded to the extractor. Defaults to 1.
    """
    split_names = [
        cfg.preprocess.train_file.split(".")[0],
        cfg.preprocess.valid_file.split(".")[0],
    ]
    if "test" not in split_names:
        split_names.append("test")
    if "eval" in dataset:
        split_names = ["test"]

    dataset_output = os.path.join(output_path, dataset)
    metadata = []
    for split_name in split_names:
        split_path = os.path.join(dataset_output, split_name + ".json")
        with open(split_path, "r") as fp:
            metadata += json.load(fp)

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
85
+
86
def extract_phonme_sequences(dataset, output_path, cfg):
    """Extract phoneme sequences for every utterance of one dataset.

    Split names are the stems (text before the first ".") of
    ``cfg.preprocess.train_file`` and ``cfg.preprocess.valid_file``, plus
    "test" when not already present; a dataset whose name contains "eval"
    uses only "test".

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory holding the per-dataset split JSON files
        cfg: loaded configuration
    """
    split_names = [
        cfg.preprocess.train_file.split(".")[0],
        cfg.preprocess.valid_file.split(".")[0],
    ]
    if "test" not in split_names:
        split_names.append("test")
    if "eval" in dataset:
        split_names = ["test"]

    dataset_output = os.path.join(output_path, dataset)
    metadata = []
    for split_name in split_names:
        split_path = os.path.join(dataset_output, split_name + ".json")
        with open(split_path, "r") as fp:
            metadata += json.load(fp)

    phone_extractor.extract_utt_phone_sequence(cfg, metadata)
114
+
115
+
116
def preprocess(cfg, args):
    """Extract features for single or multiple datasets (in cfg.dataset).

    Active stages, in order:
      1. acoustic feature extraction, statistics and normalization for every
         dataset whose name does not contain a pitch-shift / formant-shift /
         equalizer tag;
      2. for the tagged datasets, acoustic features are copied from the
         source dataset instead of being extracted;
      3. content feature extraction for every dataset;
      4. phoneme sequence extraction when ``cfg.preprocess.extract_phone``.

    Args:
        cfg: loaded configuration
        args: parsed command line; only ``num_workers`` is read by the active
            stages
    """
    # Datasets carrying one of these tags reuse the acoustic features of the
    # dataset they were derived from (checked in this order).
    copied_tags = ("pitch_shift", "formant_shift", "equalizer")

    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    # NOTE(review): the stages below were disabled in the original by wrapping
    # them in a string literal; they are kept here as comments. As a result
    # `args.prepare_alignment` is not used by this function. Confirm whether
    # the split / augmentation / metadata stages should be re-enabled.
    #
    # ## Split train and test sets
    # for dataset in cfg.dataset:
    #     print("Preprocess {}...".format(dataset))
    #
    #     if args.prepare_alignment:
    #         ## Prepare alignment with MFA
    #         print("Prepare alignment {}...".format(dataset))
    #         prepare_align(
    #             dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
    #         )
    #
    #     preprocess_dataset(
    #         dataset,
    #         cfg.dataset_path[dataset],
    #         output_path,
    #         cfg.preprocess,
    #         is_custom_dataset=cfg.use_custom_dataset,
    #     )
    #
    # # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    # try:
    #     assert isinstance(
    #         cfg.preprocess.data_augment, list
    #     ), "Please provide a list of datasets need to be augmented."
    #     if len(cfg.preprocess.data_augment) > 0:
    #         new_datasets_list = []
    #         for dataset in cfg.preprocess.data_augment:
    #             new_datasets = data_augment.augment_dataset(cfg, dataset)
    #             new_datasets_list.extend(new_datasets)
    #         cfg.dataset.extend(new_datasets_list)
    #         print("Augmentation datasets: ", cfg.dataset)
    # except:
    #     print("No Data Augmentation.")
    #
    # # Dump metadata of datasets (singers, train/test durations, etc.)
    # cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if any(tag in dataset for tag in copied_tags):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)

        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.pitch_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)

        if cfg.preprocess.energy_norm:
            acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)

    # Copy acoustic features for augmented datasets (the original comment says
    # this is done by creating soft-links; that happens inside the extractor).
    for dataset in cfg.dataset:
        matched_tag = next((tag for tag in copied_tags if tag in dataset), None)
        if matched_tag is None:
            continue
        src_dataset = dataset.replace("_" + matched_tag, "")
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)

        # The metadata is read from the *source* dataset's split files.
        splits = ["test"] if "eval" in dataset else ["train", "test"]
        metadata = []
        for split in splits:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))

        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)

    # Prepare the phoneme sequences
    if cfg.preprocess.extract_phone:
        for dataset in cfg.dataset:
            print("Extracting phoneme sequence for {}...".format(dataset))
            extract_phonme_sequences(dataset, output_path, cfg)
237
+
238
def main():
    """Parse the command line, load the config, and run ``preprocess``.

    Flags: ``--config`` (default "config.json"), ``--num_workers`` (default:
    CPU count) and ``--prepare_alignment`` (boolean, default False).
    """

    def _str2bool(value):
        """Convert a command-line token to bool.

        ``type=bool`` treats every non-empty string as True, so
        ``--prepare_alignment False`` used to enable the option.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string was the only falsy token under ``type=bool``.
        if token in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value)
        )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # nargs="?" with const=True also accepts the bare flag form.
    parser.add_argument(
        "--prepare_alignment",
        type=_str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Whether to prepare alignments before preprocessing (true/false).",
    )

    args = parser.parse_args()
    cfg = load_config(args.config)

    preprocess(cfg, args)


if __name__ == "__main__":
    main()
bins/tts/train.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
11
+ from models.tts.vits.vits_trainer import VITSTrainer
12
+ from models.tts.valle.valle_trainer import VALLETrainer
13
+ from utils.util import load_config
14
+
15
+
16
def build_trainer(args, cfg):
    """Instantiate the TTS trainer class registered for ``cfg.model_type``.

    Args:
        args: parsed command-line namespace, forwarded to the constructor.
        cfg: loaded configuration; ``cfg.model_type`` selects the class.

    Returns:
        The constructed trainer.

    Raises:
        KeyError: if ``cfg.model_type`` is not a registered model type.
    """
    registry = {
        "FastSpeech2": FastSpeech2Trainer,
        "VITS": VITSTrainer,
        "VALLE": VALLETrainer,
    }
    return registry[cfg.model_type](args, cfg)
26
+
27
+
28
def cuda_relevant(deterministic=False):
    """Apply the process-wide torch backend settings used by this script.

    Empties the CUDA cache, turns on the TF32 switches and cuDNN, then sets
    the determinism-related flags from ``deterministic``.

    Args:
        deterministic (bool): when True, request deterministic algorithms and
            disable cuDNN benchmarking; when False, the opposite.
    """
    torch.cuda.empty_cache()

    cudnn = torch.backends.cudnn

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
38
+
39
+
40
def main():
    """Parse the command line, load the config, and run the training loop.

    Flags registered here: ``--config`` and ``--exp_name`` (both required),
    ``--resume``, ``--log_level``, ``--resume_type`` and ``--checkpoint_path``;
    ``VALLETrainer.add_arguments`` is then given the parser to register its
    own options.

    When ``cfg.preprocess.data_augment`` is a non-empty list, the names of the
    augmented variants of each listed dataset (one per enabled
    ``cfg.preprocess.use_*`` switch) are appended to ``cfg.dataset`` before
    the trainer is built.
    """
    parser = argparse.ArgumentParser()
    # NOTE: ``default`` is inert on the two required options below; it is kept
    # so the registered arguments stay exactly as before.
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume", action="store_true", help="The model name to restore"
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="resume",
        help="Resume training or finetuning.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Checkpoint for resume training or finetuning.",
    )

    VALLETrainer.add_arguments(parser)
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    augment_sources = cfg.preprocess.data_augment
    # isinstance() instead of ``type(...) == list`` so list subclasses count too.
    if isinstance(augment_sources, list) and augment_sources:
        suffix_switches = (
            ("pitch_shift", cfg.preprocess.use_pitch_shift),
            ("formant_shift", cfg.preprocess.use_formant_shift),
            ("equalizer", cfg.preprocess.use_equalizer),
            ("time_stretch", cfg.preprocess.use_time_stretch),
        )
        enabled_suffixes = [suffix for suffix, enabled in suffix_switches if enabled]
        # Build the complete list first: cfg.dataset must not be mutated while
        # the source list is still being iterated (they could be the same list).
        new_datasets_list = [
            f"{dataset}_{suffix}"
            for dataset in augment_sources
            for suffix in enabled_suffixes
        ]
        cfg.dataset.extend(new_datasets_list)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)
    # Thread limits are applied after the trainer is built, as in the original.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    trainer.train_loop()


if __name__ == "__main__":
    main()
bins/vocoder/inference.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+
9
+ import torch
10
+
11
+ from models.vocoders.vocoder_inference import VocoderInference
12
+ from utils.util import load_config
13
+
14
+
15
def build_inference(args, cfg, infer_type="infer_from_dataset"):
    """Instantiate the vocoder inference class for ``cfg.model_type``.

    Args:
        args: parsed command-line namespace, forwarded to the constructor.
        cfg: loaded configuration; ``cfg.model_type`` selects the class.
        infer_type: forwarded as the third constructor argument. Defaults to
            "infer_from_dataset".

    Returns:
        The constructed inference object.

    Raises:
        KeyError: if ``cfg.model_type`` is not a registered model type.
    """
    registry = {"GANVocoder": VocoderInference}
    chosen_class = registry[cfg.model_type]
    return chosen_class(args, cfg, infer_type)
22
+
23
+
24
def cuda_relevant(deterministic=False):
    """Apply the process-wide torch backend settings used by this script.

    Empties the CUDA cache, turns on the TF32 switches and cuDNN, then sets
    the determinism-related flags from ``deterministic``.

    Args:
        deterministic (bool): when True, request deterministic algorithms and
            disable cuDNN benchmarking; when False, the opposite.
    """
    torch.cuda.empty_cache()

    cudnn = torch.backends.cudnn

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
34
+
35
+
36
def build_parser():
    r"""Build argument parser for inference.py.

    Anything else should be put in an extra config YAML file.

    Returns:
        argparse.ArgumentParser: parser with every option registered;
        ``--config`` and ``--vocoder_dir`` are required.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_table = (
        (
            "--config",
            {"type": str, "required": True, "help": "JSON/YAML file for configurations."},
        ),
        # NOTE(review): `required=None` is falsy, so this option is optional and
        # defaults to None; it looks like `default=None` was intended — confirm.
        ("--infer_mode", {"type": str, "required": None}),
        ("--infer_datasets", {"nargs": "+", "default": None}),
        ("--feature_folder", {"type": str, "default": None}),
        ("--audio_folder", {"type": str, "default": None}),
        (
            "--vocoder_dir",
            {
                "type": str,
                "required": True,
                "help": "Vocoder checkpoint directory. Searching behavior is the same as "
                "the acoustics one.",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": "result",
                "help": "Output directory. Default: ./result",
            },
        ),
        (
            "--log_level",
            {
                "type": str,
                "default": "warning",
                "help": "Logging level. Default: warning",
            },
        ),
        (
            "--keep_cache",
            {
                "action": "store_true",
                "default": False,
                "help": "Keep cache files. Only applicable to inference from files.",
            },
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
94
+
95
+
96
def main():
    """Entry point: parse arguments, load the config, and run vocoder inference.

    ``args.infer_mode`` is passed to ``build_inference`` as the inference type.
    """
    # Parse arguments
    args = build_parser().parse_args()

    # Parse config
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build the inference object, then run it.
    inferencer = build_inference(args, cfg, args.infer_mode)
    inferencer.inference()


if __name__ == "__main__":
    main()
bins/vocoder/preprocess.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features of utterances in the dataset

    The metadata of every split is read from ``<output_path>/<dataset>/<split>.json``
    and handed to the serial extractor in one call.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files of datasets
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): num of processes to extract features in parallel.
            Not used by this function, which calls the serial extractor. Defaults to 1.
    """
    dataset_output = os.path.join(output_path, dataset)
    # Datasets whose name contains "eval" only have a test split
    splits = ["test"] if "eval" in dataset else ["train", "test"]

    utterances = []
    for split in splits:
        split_file = os.path.join(dataset_output, "{}.json".format(split))
        with open(split_file, "r") as f:
            utterances.extend(json.load(f))

    acoustic_extractor.extract_utt_acoustic_features_serial(
        utterances, dataset_output, cfg
    )
43
+
44
+
45
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset)

    Steps, in order:
      1. split every dataset into train and test sets,
      2. optionally create augmented datasets and append them to ``cfg.dataset``,
      3. dump the dataset metadata,
      4. extract acoustic features for the datasets that need it,
      5. copy acoustic features from the source dataset for the pitch-shift,
         formant-shift and equalizer variants.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Augmented variants whose acoustic features are copied from the source
    # dataset instead of being extracted again (checked in this order).
    copied_markers = ("pitch_shift", "formant_shift", "equalizer")

    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            is_custom_dataset=cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
    try:
        datasets_to_augment = cfg.preprocess.data_augment
    except (AttributeError, KeyError):
        datasets_to_augment = None

    if not isinstance(datasets_to_augment, list):
        # Not configured as a list of dataset names: augmentation is off.
        print("No Data Augmentation.")
    elif datasets_to_augment:
        try:
            new_datasets_list = []
            for dataset in datasets_to_augment:
                new_datasets_list.extend(data_augment.augment_dataset(cfg, dataset))
        except Exception as err:
            # Augmentation stays best-effort, but the failure is reported
            # instead of being mislabelled as "not configured".
            print(
                "Data augmentation failed, continuing without it: {!r}".format(err)
            )
        else:
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic features
        # We will copy acoustic features from the original dataset later
        if any(marker in dataset for marker in copied_markers):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        marker = next((m for m in copied_markers if m in dataset), None)
        if marker is None:
            continue
        src_dataset = dataset.replace("_" + marker, "")
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)

        # The utterance list is read from the source dataset's split files
        metadata = []
        for split in ["test"] if "eval" in dataset else ["train", "test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))

        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        # NOTE(review): nesting reconstructed from a flattened listing; the
        # statistics below are assumed to run once per augmented dataset.
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
134
+
135
+
136
def main():
    """Parse the command line, load the config and run the preprocessing."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    # One worker per CPU reported by multiprocessing unless overridden
    arg_parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    cli_args = arg_parser.parse_args()

    preprocess(load_config(cli_args.config), cli_args)


if __name__ == "__main__":
    main()
bins/vocoder/train.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
11
+ from utils.util import load_config
12
+
13
+
14
def build_trainer(args, cfg):
    """Instantiate the trainer registered for ``cfg.model_type``.

    Args:
        args: parsed command-line arguments, passed through to the trainer.
        cfg: loaded configuration; ``cfg.model_type`` selects the trainer class.

    Returns:
        The trainer instance built as ``trainer_class(args, cfg)``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported trainer name.
    """
    trainer_classes = {"GANVocoder": GANVocoderTrainer}
    return trainer_classes[cfg.model_type](args, cfg)
22
+
23
+
24
def cuda_relevant(deterministic=False):
    """Configure the global PyTorch CUDA / cuDNN switches.

    Args:
        deterministic (bool, optional): value assigned to the cuDNN
            ``deterministic`` flag and passed to
            ``torch.use_deterministic_algorithms``; cuDNN benchmarking is set
            to its negation. Defaults to False.
    """
    torch.cuda.empty_cache()

    cudnn = torch.backends.cudnn

    # TF32 on Ampere and above
    torch.backends.cuda.matmul.allow_tf32 = True
    cudnn.enabled = True
    cudnn.allow_tf32 = True

    # Deterministic
    cudnn.deterministic = deterministic
    cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
34
+
35
+
36
def main():
    """Parse the command line, load the config and train the vocoder.

    ``--config`` and ``--exp_name`` are required; ``--resume_type``,
    ``--checkpoint`` and ``--log_level`` are optional.
    """
    parser = argparse.ArgumentParser()
    # No ``default`` on the required options: argparse never uses the default
    # of a required argument, so it would only be misleading.
    parser.add_argument(
        "--config",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        help="resume for continue to train, finetune for finetuning",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="checkpoint to resume",
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    if cfg.preprocess.data_augment:
        # Only the equalizer and time-stretch variants are added here; the
        # pitch-shift and formant-shift variants are left out.
        cfg.dataset.extend(
            f"{dataset}{suffix}"
            for dataset in cfg.preprocess.data_augment
            for suffix in ("_equalizer", "_time_stretch")
        )

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)

    trainer.train_loop()


if __name__ == "__main__":
    main()
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
config/base.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": false,
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming; sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // linguistic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extracted mel features
75
+ "mcep_dir": "mcep", // directory name of extracted mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extracted label features
79
+ "wenet_dir": "wenet", // directory name of extracted wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extracted contentvec features
81
+ "pitch_dir": "pitches", // directory name of extracted pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extracted pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validation set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "data_augment": false,
122
+ "align_mel_duration": false
123
+ },
124
+ "train": {
125
+ "ddp": true,
126
+ "random_seed": 970227,
127
+ "batch_size": 16,
128
+ "max_steps": 1000000,
129
+ // Trackers
130
+ "tracker": [
131
+ "tensorboard"
132
+ // "wandb",
133
+ // "cometml",
134
+ // "mlflow",
135
+ ],
136
+ "max_epoch": -1,
137
+ // -1 means no limit
138
+ "save_checkpoint_stride": [
139
+ 5,
140
+ 20
141
+ ],
142
+ // unit is epoch
143
+ "keep_last": [
144
+ 3,
145
+ -1
146
+ ],
147
+ // -1 means infinite, if one number will broadcast
148
+ "run_eval": [
149
+ false,
150
+ true
151
+ ],
152
+ // if one number will broadcast
153
+ // Fix the random seed
154
+ "random_seed": 10086,
155
+ // Optimizer
156
+ "optimizer": "AdamW",
157
+ "adamw": {
158
+ "lr": 4.0e-4
159
+ // nn model lr
160
+ },
161
+ // LR Scheduler
162
+ "scheduler": "ReduceLROnPlateau",
163
+ "reducelronplateau": {
164
+ "factor": 0.8,
165
+ "patience": 10,
166
+ // unit is epoch
167
+ "min_lr": 1.0e-4
168
+ },
169
+ // Batchsampler
170
+ "sampler": {
171
+ "holistic_shuffle": true,
172
+ "drop_last": true
173
+ },
174
+ // Dataloader
175
+ "dataloader": {
176
+ "num_worker": 32,
177
+ "pin_memory": true
178
+ },
179
+ "gradient_accumulation_step": 1,
180
+ "total_training_steps": 50000,
181
+ "save_summary_steps": 500,
182
+ "save_checkpoints_steps": 10000,
183
+ "valid_interval": 10000,
184
+ "keep_checkpoint_max": 5,
185
+ "multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
186
+ "max_epoch": -1,
187
+ // -1 means no limit
188
+ "save_checkpoint_stride": [
189
+ 5,
190
+ 20
191
+ ],
192
+ // unit is epoch
193
+ "keep_last": [
194
+ 3,
195
+ -1
196
+ ],
197
+ // -1 means infinite, if one number will broadcast
198
+ "run_eval": [
199
+ false,
200
+ true
201
+ ],
202
+ // Batchsampler
203
+ "sampler": {
204
+ "holistic_shuffle": true,
205
+ "drop_last": true
206
+ },
207
+ // Dataloader
208
+ "dataloader": {
209
+ "num_worker": 32,
210
+ "pin_memory": true
211
+ },
212
+ // Trackers
213
+ "tracker": [
214
+ "tensorboard"
215
+ // "wandb",
216
+ // "cometml",
217
+ // "mlflow",
218
+ ],
219
+ },
220
+ }
config/comosvc.json ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "DiffComoSVC",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "teacher_model_path": "[Your Teacher Model Path].bin",
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ // Quantization (0 for no quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ "comosvc": {
116
+ "distill": false,
117
+ // conformer encoder
118
+ "input_dim": 384,
119
+ "output_dim": 100,
120
+ "n_heads": 2,
121
+ "n_layers": 6,
122
+ "filter_channels": 512,
123
+ "dropout": 0.1,
124
+ // karras diffusion
125
+ "P_mean": -1.2,
126
+ "P_std": 1.2,
127
+ "sigma_data": 0.5,
128
+ "sigma_min": 0.002,
129
+ "sigma_max": 80,
130
+ "rho": 7,
131
+ "n_timesteps": 40,
132
+ },
133
+ "diffusion": {
134
+ // Diffusion steps encoder
135
+ "step_encoder": {
136
+ "dim_raw_embedding": 128,
137
+ "dim_hidden_layer": 512,
138
+ "activation": "SiLU",
139
+ "num_layer": 2,
140
+ "max_period": 10000
141
+ },
142
+ // Diffusion decoder
143
+ "model_type": "bidilconv",
144
+ // bidilconv, unet2d, TODO: unet1d
145
+ "bidilconv": {
146
+ "base_channel": 384,
147
+ "n_res_block": 20,
148
+ "conv_kernel_size": 3,
149
+ "dilation_cycle_length": 4,
150
+ // specially, 1 means no dilation
151
+ "conditioner_size": 100
152
+ }
153
+ },
154
+ },
155
+ "train": {
156
+ // Basic settings
157
+ "fast_steps": 0,
158
+ "batch_size": 32,
159
+ "gradient_accumulation_step": 1,
160
+ "max_epoch": -1,
161
+ // -1 means no limit
162
+ "save_checkpoint_stride": [
163
+ 10,
164
+ 100
165
+ ],
166
+ // unit is epoch
167
+ "keep_last": [
168
+ 3,
169
+ -1
170
+ ],
171
+ // -1 means infinite, if one number will broadcast
172
+ "run_eval": [
173
+ false,
174
+ true
175
+ ],
176
+ // if one number will broadcast
177
+ // Fix the random seed
178
+ "random_seed": 10086,
179
+ // Batchsampler
180
+ "sampler": {
181
+ "holistic_shuffle": true,
182
+ "drop_last": true
183
+ },
184
+ // Dataloader
185
+ "dataloader": {
186
+ "num_worker": 32,
187
+ "pin_memory": true
188
+ },
189
+ // Trackers
190
+ "tracker": [
191
+ "tensorboard"
192
+ // "wandb",
193
+ // "cometml",
194
+ // "mlflow",
195
+ ],
196
+ // Optimizer
197
+ "optimizer": "AdamW",
198
+ "adamw": {
199
+ "lr": 4.0e-4
200
+ // nn model lr
201
+ },
202
+ // LR Scheduler
203
+ "scheduler": "ReduceLROnPlateau",
204
+ "reducelronplateau": {
205
+ "factor": 0.8,
206
+ "patience": 10,
207
+ // unit is epoch
208
+ "min_lr": 1.0e-4
209
+ }
210
+ },
211
+ "inference": {
212
+ "comosvc": {
213
+ "inference_steps": 40
214
+ }
215
+ }
216
+ }
config/diffusion.json ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // FIXME: THESE ARE LEGACY
3
+ "base_config": "config/base.json",
4
+ "model_type": "diffusion",
5
+ "task_type": "svc",
6
+ "use_custom_dataset": false,
7
+ "preprocess": {
8
+ // data augmentations
9
+ "use_pitch_shift": false,
10
+ "use_formant_shift": false,
11
+ "use_time_stretch": false,
12
+ "use_equalizer": false,
13
+ // acoustic features
14
+ "extract_mel": true,
15
+ "mel_min_max_norm": true,
16
+ "extract_pitch": true,
17
+ "pitch_extractor": "parselmouth",
18
+ "extract_uv": true,
19
+ "extract_energy": true,
20
+ // content features
21
+ "extract_whisper_feature": false,
22
+ "whisper_sample_rate": 16000,
23
+ "extract_contentvec_feature": false,
24
+ "contentvec_sample_rate": 16000,
25
+ "extract_wenet_feature": false,
26
+ "wenet_sample_rate": 16000,
27
+ "extract_mert_feature": false,
28
+ "mert_sample_rate": 16000,
29
+ // Default config for whisper
30
+ "whisper_frameshift": 0.01,
31
+ "whisper_downsample_rate": 2,
32
+ // Default config for content vector
33
+ "contentvec_frameshift": 0.02,
34
+ // Default config for mert
35
+ "mert_model": "m-a-p/MERT-v1-330M",
36
+ "mert_feature_layer": -1,
37
+ "mert_hop_size": 320,
38
+ // 24k
39
+ "mert_frameshit": 0.01333,
40
+ // 10ms
41
+ "wenet_frameshift": 0.01,
42
+ // wenetspeech is 4, gigaspeech is 6
43
+ "wenet_downsample_rate": 4,
44
+ // Default config
45
+ "n_mel": 100,
46
+ "win_size": 1024,
47
+ // todo
48
+ "hop_size": 256,
49
+ "sample_rate": 24000,
50
+ "n_fft": 1024,
51
+ // todo
52
+ "fmin": 0,
53
+ "fmax": 12000,
54
+ // todo
55
+ "f0_min": 50,
56
+ // ~C2
57
+ "f0_max": 1100,
58
+ //1100, // ~C6(1100), ~G5(800)
59
+ "pitch_bin": 256,
60
+ "pitch_max": 1100.0,
61
+ "pitch_min": 50.0,
62
+ "is_label": true,
63
+ "is_mu_law": true,
64
+ "bits": 8,
65
+ "mel_min_max_stats_dir": "mel_min_max_stats",
66
+ "whisper_dir": "whisper",
67
+ "contentvec_dir": "contentvec",
68
+ "wenet_dir": "wenet",
69
+ "mert_dir": "mert",
70
+ // Extract content features using dataloader
71
+ "pin_memory": true,
72
+ "num_workers": 8,
73
+ "content_feature_batch_size": 16,
74
+ // Features used for model training
75
+ "use_mel": true,
76
+ "use_min_max_norm_mel": true,
77
+ "use_frame_pitch": true,
78
+ "use_uv": true,
79
+ "use_frame_energy": true,
80
+ "use_log_scale_pitch": false,
81
+ "use_log_scale_energy": false,
82
+ "use_spkid": true,
83
+ // Meta file
84
+ "train_file": "train.json",
85
+ "valid_file": "test.json",
86
+ "spk2id": "singers.json",
87
+ "utt2spk": "utt2singer"
88
+ },
89
+ "model": {
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ // Quantization (0 for no quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ // FIXME: FOLLOWING ARE NEW!!
116
+ "diffusion": {
117
+ "scheduler": "ddpm",
118
+ "scheduler_settings": {
119
+ "num_train_timesteps": 1000,
120
+ "beta_start": 1.0e-4,
121
+ "beta_end": 0.02,
122
+ "beta_schedule": "linear"
123
+ },
124
+ // Diffusion steps encoder
125
+ "step_encoder": {
126
+ "dim_raw_embedding": 128,
127
+ "dim_hidden_layer": 512,
128
+ "activation": "SiLU",
129
+ "num_layer": 2,
130
+ "max_period": 10000
131
+ },
132
+ // Diffusion decoder
133
+ "model_type": "bidilconv",
134
+ // bidilconv, unet2d, TODO: unet1d
135
+ "bidilconv": {
136
+ "base_channel": 384,
137
+ "n_res_block": 20,
138
+ "conv_kernel_size": 3,
139
+ "dilation_cycle_length": 4,
140
+ // specially, 1 means no dilation
141
+ "conditioner_size": 384
142
+ },
143
+ "unet2d": {
144
+ "in_channels": 1,
145
+ "out_channels": 1,
146
+ "down_block_types": [
147
+ "CrossAttnDownBlock2D",
148
+ "CrossAttnDownBlock2D",
149
+ "CrossAttnDownBlock2D",
150
+ "DownBlock2D"
151
+ ],
152
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
153
+ "up_block_types": [
154
+ "UpBlock2D",
155
+ "CrossAttnUpBlock2D",
156
+ "CrossAttnUpBlock2D",
157
+ "CrossAttnUpBlock2D"
158
+ ],
159
+ "only_cross_attention": false
160
+ }
161
+ }
162
+ },
163
+ // FIXME: FOLLOWING ARE NEW!!
164
+ "train": {
165
+ // Basic settings
166
+ "batch_size": 64,
167
+ "gradient_accumulation_step": 1,
168
+ "max_epoch": -1,
169
+ // -1 means no limit
170
+ "save_checkpoint_stride": [
171
+ 5,
172
+ 20
173
+ ],
174
+ // unit is epoch
175
+ "keep_last": [
176
+ 3,
177
+ -1
178
+ ],
179
+ // -1 means infinite, if one number will broadcast
180
+ "run_eval": [
181
+ false,
182
+ true
183
+ ],
184
+ // if one number will broadcast
185
+ // Fix the random seed
186
+ "random_seed": 10086,
187
+ // Batchsampler
188
+ "sampler": {
189
+ "holistic_shuffle": true,
190
+ "drop_last": true
191
+ },
192
+ // Dataloader
193
+ "dataloader": {
194
+ "num_worker": 32,
195
+ "pin_memory": true
196
+ },
197
+ // Trackers
198
+ "tracker": [
199
+ "tensorboard"
200
+ // "wandb",
201
+ // "cometml",
202
+ // "mlflow",
203
+ ],
204
+ // Optimizer
205
+ "optimizer": "AdamW",
206
+ "adamw": {
207
+ "lr": 4.0e-4
208
+ // nn model lr
209
+ },
210
+ // LR Scheduler
211
+ "scheduler": "ReduceLROnPlateau",
212
+ "reducelronplateau": {
213
+ "factor": 0.8,
214
+ "patience": 10,
215
+ // unit is epoch
216
+ "min_lr": 1.0e-4
217
+ }
218
+ },
219
+ "inference": {
220
+ "diffusion": {
221
+ "scheduler": "pndm",
222
+ "scheduler_settings": {
223
+ "num_inference_timesteps": 1000
224
+ }
225
+ }
226
+ }
227
+ }
config/fs2.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "FastSpeech2",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": true,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_min_max_norm_mel": false,
54
+ "use_frame_pitch": false,
55
+ "use_frame_energy": false,
56
+ "use_phone_pitch": true,
57
+ "use_phone_energy": true,
58
+ "use_log_scale_pitch": false,
59
+ "use_log_scale_energy": false,
60
+ "use_spkid": false,
61
+ "align_mel_duration": true,
62
+ "text_cleaners": ["english_cleaners"]
63
+ },
64
+ "model": {
65
+ // Settings for transformer
66
+ "transformer": {
67
+ "encoder_layer": 4,
68
+ "encoder_head": 2,
69
+ "encoder_hidden": 256,
70
+ "decoder_layer": 6,
71
+ "decoder_head": 2,
72
+ "decoder_hidden": 256,
73
+ "conv_filter_size": 1024,
74
+ "conv_kernel_size": [9, 1],
75
+ "encoder_dropout": 0.2,
76
+ "decoder_dropout": 0.2
77
+ },
78
+
79
+ // Settings for variance_predictor
80
+ "variance_predictor":{
81
+ "filter_size": 256,
82
+ "kernel_size": 3,
83
+ "dropout": 0.5
84
+ },
85
+ "variance_embedding":{
86
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
87
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
88
+ "n_bins": 256
89
+ },
90
+ "max_seq_len": 1000
91
+ },
92
+ "train":{
93
+ "batch_size": 16,
94
+ "sort_sample": true,
95
+ "drop_last": true,
96
+ "group_size": 4,
97
+ "grad_clip_thresh": 1.0,
98
+ "dataloader": {
99
+ "num_worker": 8,
100
+ "pin_memory": true
101
+ },
102
+ "lr_scheduler":{
103
+ "num_warmup": 4000
104
+ },
105
+ // LR Scheduler
106
+ "scheduler": "NoamLR",
107
+ // Optimizer
108
+ "optimizer": "Adam",
109
+ "adam": {
110
+ "lr": 0.0625,
111
+ "betas": [0.9, 0.98],
112
+ "eps": 0.000000001,
113
+ "weight_decay": 0.0
114
+ },
115
+ }
116
+
117
+ }
config/transformer.json ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "Transformer",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "condition_encoder": {
90
+ "merge_mode": "add",
91
+ "input_melody_dim": 1,
92
+ "use_log_f0": true,
93
+ "n_bins_melody": 256,
94
+ //# Quantization (0 for not quantization)
95
+ "output_melody_dim": 384,
96
+ "input_loudness_dim": 1,
97
+ "use_log_loudness": true,
98
+ "n_bins_loudness": 256,
99
+ "output_loudness_dim": 384,
100
+ "use_whisper": false,
101
+ "use_contentvec": true,
102
+ "use_wenet": false,
103
+ "use_mert": false,
104
+ "whisper_dim": 1024,
105
+ "contentvec_dim": 256,
106
+ "mert_dim": 256,
107
+ "wenet_dim": 512,
108
+ "content_encoder_dim": 384,
109
+ "output_singer_dim": 384,
110
+ "singer_table_size": 512,
111
+ "output_content_dim": 384,
112
+ "use_spkid": true
113
+ },
114
+ "transformer": {
115
+ "type": "conformer",
116
+ // 'conformer' or 'transformer'
117
+ "input_dim": 384,
118
+ "output_dim": 100,
119
+ "n_heads": 2,
120
+ "n_layers": 6,
121
+ "filter_channels": 512,
122
+ "dropout": 0.1,
123
+ }
124
+ },
125
+ "train": {
126
+ // Basic settings
127
+ "batch_size": 64,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1,
130
+ // -1 means no limit
131
+ "save_checkpoint_stride": [
132
+ 10,
133
+ 100
134
+ ],
135
+ // unit is epoch
136
+ "keep_last": [
137
+ 3,
138
+ -1
139
+ ],
140
+ // -1 means infinite, if one number will broadcast
141
+ "run_eval": [
142
+ false,
143
+ true
144
+ ],
145
+ // if one number will broadcast
146
+ // Fix the random seed
147
+ "random_seed": 10086,
148
+ // Batchsampler
149
+ "sampler": {
150
+ "holistic_shuffle": true,
151
+ "drop_last": true
152
+ },
153
+ // Dataloader
154
+ "dataloader": {
155
+ "num_worker": 32,
156
+ "pin_memory": true
157
+ },
158
+ // Trackers
159
+ "tracker": [
160
+ "tensorboard"
161
+ // "wandb",
162
+ // "cometml",
163
+ // "mlflow",
164
+ ],
165
+ // Optimizer
166
+ "optimizer": "AdamW",
167
+ "adamw": {
168
+ "lr": 4.0e-4
169
+ // nn model lr
170
+ },
171
+ // LR Scheduler
172
+ "scheduler": "ReduceLROnPlateau",
173
+ "reducelronplateau": {
174
+ "factor": 0.8,
175
+ "patience": 10,
176
+ // unit is epoch
177
+ "min_lr": 1.0e-4
178
+ }
179
+ }
180
+ }
config/tts.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "supported_model_type": [
4
+ "Fastspeech2",
5
+ "VITS",
6
+ "VALLE",
7
+ ],
8
+ "task_type": "tts",
9
+ "preprocess": {
10
+ "language": "en-us",
11
+ // linguistic features
12
+ "extract_phone": true,
13
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
14
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
15
+ // Directory names of processed data or extracted features
16
+ "phone_dir": "phones",
17
+ "use_phone": true,
18
+ // "spk2id": "spk2id.json", // used for multi-speaker dataset
19
+ // "utt2spk": "utt2spk", // used for multi-speaker dataset
20
+ "add_blank": true
21
+ },
22
+ "model": {
23
+ "text_token_num": 512,
24
+ }
25
+
26
+ }
config/valle.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VALLE",
4
+ "task_type": "tts",
5
+ "dataset": [
6
+ "libritts"
7
+ ],
8
+ "preprocess": {
9
+ "extract_phone": true,
10
+ "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11
+ "extract_acoustic_token": true,
12
+ "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13
+ "acoustic_token_dir": "acoutic_tokens",
14
+ "use_text": false,
15
+ "use_phone": true,
16
+ "use_acoustic_token": true,
17
+ "symbols_dict": "symbols.dict",
18
+ "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19
+ "max_duration": 14, // the duration upper bound to filter the audio with duration > max_duration.
20
+ "sampling_rate": 24000,
21
+ },
22
+ "model": {
23
+ "text_token_num": 512,
24
+ "audio_token_num": 1024,
25
+ "decoder_dim": 1024, // embedding dimension of the decoder model
26
+ "nhead": 16, // number of attention heads in the decoder layers
27
+ "num_decoder_layers": 12, // number of decoder layers
28
+ "norm_first": true, // pre or post Normalization.
29
+ "add_prenet": false, // whether add PreNet after Inputs
30
+ "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
31
+ "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
32
+ "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
33
+ "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
34
+ "num_quantizers": 8, // number of the audio quantization layers
35
+ // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
36
+ },
37
+ "train": {
38
+ "ddp": false,
39
+ "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
40
+ "max_epoch": 20,
41
+ "optimizer": "ScaledAdam",
42
+ "scheduler": "Eden",
43
+ "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
44
+ "base_lr": 0.05, // base learning rate
45
+ "valid_interval": 1000,
46
+ "log_epoch_step": 1000,
47
+ "save_checkpoint_stride": [
48
+ 1,
49
+ 1
50
+ ]
51
+ }
52
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VITS",
4
+ "task_type": "tts",
5
+ "preprocess": {
6
+ "extract_phone": true,
7
+ "extract_mel": true,
8
+ "n_mel": 80,
9
+ "fmin": 0,
10
+ "fmax": null,
11
+ "extract_linear_spec": true,
12
+ "extract_audio": true,
13
+ "use_linear": true,
14
+ "use_mel": true,
15
+ "use_audio": true,
16
+ "use_text": false,
17
+ "use_phone": true,
18
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19
+ "n_fft": 1024,
20
+ "win_size": 1024,
21
+ "hop_size": 256,
22
+ "segment_size": 8192,
23
+ "text_cleaners": [
24
+ "english_cleaners"
25
+ ]
26
+ },
27
+ "model": {
28
+ "text_token_num": 512,
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0.1,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [
38
+ 3,
39
+ 7,
40
+ 11
41
+ ],
42
+ "resblock_dilation_sizes": [
43
+ [
44
+ 1,
45
+ 3,
46
+ 5
47
+ ],
48
+ [
49
+ 1,
50
+ 3,
51
+ 5
52
+ ],
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ]
58
+ ],
59
+ "upsample_rates": [
60
+ 8,
61
+ 8,
62
+ 2,
63
+ 2
64
+ ],
65
+ "upsample_initial_channel": 512,
66
+ "upsample_kernel_sizes": [
67
+ 16,
68
+ 16,
69
+ 4,
70
+ 4
71
+ ],
72
+ "n_layers_q": 3,
73
+ "use_spectral_norm": false,
74
+ "n_speakers": 10, // number of speakers, will be automatically set if n_speakers is 0 and multi_speaker_training is true
75
+ "gin_channels": 256,
76
+ "use_sdp": true
77
+ },
78
+ "train": {
79
+ "fp16_run": true,
80
+ "learning_rate": 2e-4,
81
+ "betas": [
82
+ 0.8,
83
+ 0.99
84
+ ],
85
+ "eps": 1e-9,
86
+ "batch_size": 16,
87
+ "lr_decay": 0.999875,
88
+ // "segment_size": 8192,
89
+ "init_lr_ratio": 1,
90
+ "warmup_epochs": 0,
91
+ "c_mel": 45,
92
+ "c_kl": 1.0,
93
+ "AdamW": {
94
+ "betas": [
95
+ 0.8,
96
+ 0.99
97
+ ],
98
+ "eps": 1e-9,
99
+ }
100
+ }
101
+ }
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "dataset": [
4
+ "LJSpeech",
5
+ "LibriTTS",
6
+ "opencpop",
7
+ "m4singer",
8
+ "svcc",
9
+ "svcceval",
10
+ "pjs",
11
+ "opensinger",
12
+ "popbutfy",
13
+ "nus48e",
14
+ "popcs",
15
+ "kising",
16
+ "csd",
17
+ "opera",
18
+ "vctk",
19
+ "lijian",
20
+ "cdmusiceval"
21
+ ],
22
+ "task_type": "vocoder",
23
+ "preprocess": {
24
+ // acoustic features
25
+ "extract_mel": true,
26
+ "extract_pitch": false,
27
+ "extract_uv": false,
28
+ "extract_audio": true,
29
+ "extract_label": false,
30
+ "extract_one_hot": false,
31
+ "extract_amplitude_phase": false,
32
+ "pitch_extractor": "parselmouth",
33
+ // Settings for data preprocessing
34
+ "n_mel": 100,
35
+ "win_size": 1024,
36
+ "hop_size": 256,
37
+ "sample_rate": 24000,
38
+ "n_fft": 1024,
39
+ "fmin": 0,
40
+ "fmax": 12000,
41
+ "f0_min": 50,
42
+ "f0_max": 1100,
43
+ "pitch_bin": 256,
44
+ "pitch_max": 1100.0,
45
+ "pitch_min": 50.0,
46
+ "is_mu_law": false,
47
+ "bits": 8,
48
+ "cut_mel_frame": 32,
49
+ // Directory names of processed data or extracted features
50
+ "spk2id": "singers.json",
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_frame_pitch": false,
54
+ "use_uv": false,
55
+ "use_audio": true,
56
+ "use_label": false,
57
+ "use_one_hot": false,
58
+ "train_file": "train.json",
59
+ "valid_file": "test.json"
60
+ },
61
+ "train": {
62
+ "random_seed": 114514,
63
+ "batch_size": 64,
64
+ "gradient_accumulation_step": 1,
65
+ "max_epoch": 1000000,
66
+ "save_checkpoint_stride": [
67
+ 20
68
+ ],
69
+ "run_eval": [
70
+ true
71
+ ],
72
+ "sampler": {
73
+ "holistic_shuffle": true,
74
+ "drop_last": true
75
+ },
76
+ "dataloader": {
77
+ "num_worker": 4,
78
+ "pin_memory": true
79
+ },
80
+ "tracker": [
81
+ "tensorboard"
82
+ ],
83
+ }
84
+ }
egs/datasets/README.md ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Datasets Format
2
+
3
+ Amphion supports the following academic datasets (sorted alphabetically):
4
+
5
+ - [Datasets Format](#datasets-format)
6
+ - [AudioCaps](#audiocaps)
7
+ - [CSD](#csd)
8
+ - [KiSing](#kising)
9
+ - [LibriTTS](#libritts)
10
+ - [LJSpeech](#ljspeech)
11
+ - [M4Singer](#m4singer)
12
+ - [NUS-48E](#nus-48e)
13
+ - [Opencpop](#opencpop)
14
+ - [OpenSinger](#opensinger)
15
+ - [Opera](#opera)
16
+ - [PopBuTFy](#popbutfy)
17
+ - [PopCS](#popcs)
18
+ - [PJS](#pjs)
19
+ - [SVCC](#svcc)
20
+ - [VCTK](#vctk)
21
+
22
+ The download link and the file structure tree of each dataset are shown below.
23
+
24
+ ## AudioCaps
25
+
26
+ AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:
27
+
28
+ ```plaintext
29
+ [AudioCaps dataset path]
30
+ ┣ AudioCpas
31
+ ┃   ┣ wav
32
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
33
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
34
+ ┃ ┃ ┣ ...
35
+ ```
36
+
37
+ ## CSD
38
+
39
+ The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:
40
+
41
+ ```plaintext
42
+ [CSD dataset path]
43
+ ┣ english
44
+ ┣ korean
45
+ ┣ utterances
46
+ ┃ ┣ en001a
47
+ ┃ ┃ ┣ {UtteranceID}.wav
48
+ ┃ ┣ en001b
49
+ ┃ ┣ en002a
50
+ ┃ ┣ en002b
51
+ ┃ ┣ ...
52
+ ┣ README
53
+ ```
54
+
55
+ ## KiSing
56
+
57
+ The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:
58
+
59
+ ```plaintext
60
+ [KiSing dataset path]
61
+ ┣ clean
62
+ ┃ ┣ 421
63
+ ┃ ┣ 422
64
+ ┃ ┣ ...
65
+ ```
66
+
67
+ ## LibriTTS
68
+
69
+ The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:
70
+
71
+ ```plaintext
72
+ [LibriTTS dataset path]
73
+ ┣ BOOKS.txt
74
+ ┣ CHAPTERS.txt
75
+ ┣ eval_sentences10.tsv
76
+ ┣ LICENSE.txt
77
+ ┣ NOTE.txt
78
+ ┣ reader_book.tsv
79
+ ┣ README_librispeech.txt
80
+ ┣ README_libritts.txt
81
+ ┣ speakers.tsv
82
+ ┣ SPEAKERS.txt
83
+ ┣ dev-clean (Subset)
84
+ ┃ ┣ 1272{Speaker_ID}
85
+ ┃ ┃ ┣ 128104 {Chapter_ID}
86
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
87
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
88
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
89
+ ┃ ┃ ┃ ┣ ...
90
+ ┃ ┃ ┃ ┣ 1272_128104.book.tsv
91
+ ┃ ┃ ┃ ┣ 1272_128104.trans.tsv
92
+ ┃ ┃ ┣ ...
93
+ ┃ ┣ ...
94
+ ┣ dev-other (Subset)
95
+ ┃ ┣ 116 (Speaker)
96
+ ┃ ┃ ┣ 288045 {Chapter_ID}
97
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
98
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
99
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
100
+ ┃ ┃ ┃ ┣ ...
101
+ ┃ ┃ ┃ ┣ 116_288045.book.tsv
102
+ ┃ ┃ ┃ ┣ 116_288045.trans.tsv
103
+ ┃ ┃ ┣ ...
104
+ ┃ ┣ ...
105
+ ┃ ┣ ...
106
+ ┣ test-clean (Subset)
107
+ ┃ ┣ {Speaker_ID}
108
+ ┃ ┃ ┣ {Chapter_ID}
109
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
110
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
111
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
112
+ ┃ ┃ ┃ ┣ ...
113
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
114
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
115
+ ┃ ┃ ┣ ...
116
+ ┃ ┣ ...
117
+ ┣ test-other
118
+ ┃ ┣ {Speaker_ID}
119
+ ┃ ┃ ┣ {Chapter_ID}
120
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
121
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
122
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
123
+ ┃ ┃ ┃ ┣ ...
124
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
125
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
126
+ ┃ ┃ ┣ ...
127
+ ┃ ┣ ...
128
+ ┣ train-clean-100
129
+ ┃ ┣ {Speaker_ID}
130
+ ┃ ┃ ┣ {Chapter_ID}
131
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
132
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
133
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
134
+ ┃ ┃ ┃ ┣ ...
135
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
136
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
137
+ ┃ ┃ ┣ ...
138
+ ┃ ┣ ...
139
+ ┣ train-clean-360
140
+ ┃ ┣ {Speaker_ID}
141
+ ┃ ┃ ┣ {Chapter_ID}
142
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
143
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
144
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
145
+ ┃ ┃ ┃ ┣ ...
146
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
147
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
148
+ ┃ ┃ ┣ ...
149
+ ┃ ┣ ...
150
+ ┣ train-other-500
151
+ ┃ ┣ {Speaker_ID}
152
+ ┃ ┃ ┣ {Chapter_ID}
153
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
154
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
155
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
156
+ ┃ ┃ ┃ ┣ ...
157
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
158
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
159
+ ┃ ┃ ┣ ...
160
+ ┃ ┣ ...
161
+ ```
162
+
163
+
164
+ ## LJSpeech
165
+
166
+ The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:
167
+
168
+ ```plaintext
169
+ [LJSpeech dataset path]
170
+ ┣ metadata.csv
171
+ ┣ wavs
172
+ ┃ ┣ LJ001-0001.wav
173
+ ┃ ┣ LJ001-0002.wav
174
+ ┃ ┣ ...
175
+ ┣ README
176
+ ```
177
+
178
+ ## M4Singer
179
+
180
+ The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:
181
+
182
+ ```plaintext
183
+ [M4Singer dataset path]
184
+ ┣ {Singer_1}#{Song_1}
185
+ ┃ ┣ 0000.mid
186
+ ┃ ┣ 0000.TextGrid
187
+ ┃ ┣ 0000.wav
188
+ ┃ ┣ ...
189
+ ┣ {Singer_1}#{Song_2}
190
+ ┣ ...
191
+ ┣ {Singer_2}#{Song_1}
192
+ ┣ {Singer_2}#{Song_2}
193
+ ┣ ...
194
+ ┗ meta.json
195
+ ```
196
+
197
+ ## NUS-48E
198
+
199
+ The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:
200
+
201
+ ```plaintext
202
+ [NUS-48E dataset path]
203
+ ┣ {SpeakerID}
204
+ ┃ ┣ read
205
+ ┃ ┃ ┣ {SongID}.txt
206
+ ┃ ┃ ┣ {SongID}.wav
207
+ ┃ ┃ ┣ ...
208
+ ┃ ┣ sing
209
+ ┃ ┃ ┣ {SongID}.txt
210
+ ┃ ┃ ┣ {SongID}.wav
211
+ ┃ ┃ ┣ ...
212
+ ┣ ...
213
+ ┣ README.txt
214
+
215
+ ```
216
+
217
+ ## Opencpop
218
+
219
+ The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:
220
+
221
+ ```plaintext
222
+ [Opencpop dataset path]
223
+ ┣ midis
224
+ ┃ ┣ 2001.midi
225
+ ┃ ┣ 2002.midi
226
+ ┃ ┣ 2003.midi
227
+ ┃ ┣ ...
228
+ ┣ segments
229
+ ┃ ┣ wavs
230
+ ┃ ┃ ┣ 2001000001.wav
231
+ ┃ ┃ ┣ 2001000002.wav
232
+ ┃ ┃ ┣ 2001000003.wav
233
+ ┃ ┃ ┣ ...
234
+ ┃ ┣ test.txt
235
+ ┃ ┣ train.txt
236
+ ┃ ┗ transcriptions.txt
237
+ ┣ textgrids
238
+ ┃ ┣ 2001.TextGrid
239
+ ┃ ┣ 2002.TextGrid
240
+ ┃ ┣ 2003.TextGrid
241
+ ┃ ┣ ...
242
+ ┣ wavs
243
+ ┃ ┣ 2001.wav
244
+ ┃ ┣ 2002.wav
245
+ ┃ ┣ 2003.wav
246
+ ┃ ┣ ...
247
+ ┣ TERMS_OF_ACCESS
248
+ ┗ readme.md
249
+ ```
250
+
251
+ ## OpenSinger
252
+
253
+ The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:
254
+
255
+ ```plaintext
256
+ [OpenSinger dataset path]
257
+ ┣ ManRaw
258
+ ┃ ┣ {Singer_1}_{Song_1}
259
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
260
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
261
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
262
+ ┃ ┃ ┣ ...
263
+ ┃ ┣ {Singer_1}_{Song_2}
264
+ ┃ ┣ ...
265
+ ┣ WomanRaw
266
+ ┣ LICENSE
267
+ ┗ README.md
268
+ ```
269
+
270
+ ## Opera
271
+
272
+ The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:
273
+
274
+ ```plaintext
275
+ [Opera dataset path]
276
+ ┣ monophonic
277
+ ┃ ┣ chinese
278
+ ┃ ┃ ┣ {Gender}_{SingerID}
279
+ ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
280
+ ┃ ┃ ┃ ┣ ...
281
+ ┃ ┃ ┣ ...
282
+ ┃ ┣ western
283
+ ┣ polyphonic
284
+ ┃ ┣ chinese
285
+ ┃ ┣ western
286
+ ┣ CrossculturalDataSet.xlsx
287
+ ```
288
+
289
+ ## PopBuTFy
290
+
291
+ The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:
292
+
293
+ ```plaintext
294
+ [PopBuTFy dataset path]
295
+ ┣ data
296
+ ┃ ┣ {SingerID}#singing#{SongName}_Amateur
297
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
298
+ ┃ ┃ ┣ ...
299
+ ┃ ┣ {SingerID}#singing#{SongName}_Professional
300
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
301
+ ┃ ┃ ┣ ...
302
+ ┣ text_labels
303
+ ┗ TERMS_OF_ACCESS
304
+ ```
305
+
306
+ ## PopCS
307
+
308
+ The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:
309
+
310
+ ```plaintext
311
+ [PopCS dataset path]
312
+ ┣ popcs
313
+ ┃ ┣ popcs-{SongName}
314
+ ┃ ┃ ┣ {UtteranceID}_ph.txt
315
+ ┃ ┃ ┣ {UtteranceID}_wf0.wav
316
+ ┃ ┃ ┣ {UtteranceID}.TextGrid
317
+ ┃ ┃ ┣ {UtteranceID}.txt
318
+ ┃ ┃ ┣ ...
319
+ ┃ ┣ ...
320
+ ┗ TERMS_OF_ACCESS
321
+ ```
322
+
323
+ ## PJS
324
+
325
+ The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:
326
+
327
+ ```plaintext
328
+ [PJS dataset path]
329
+ ┣ PJS_corpus_ver1.1
330
+ ┃ ┣ background_noise
331
+ ┃ ┣ pjs{SongID}
332
+ ┃ ┃ ┣ pjs{SongID}_song.wav
333
+ ┃ ┃ ┣ pjs{SongID}_speech.wav
334
+ ┃ ┃ ┣ pjs{SongID}.lab
335
+ ┃ ┃ ┣ pjs{SongID}.mid
336
+ ┃ ┃ ┣ pjs{SongID}.musicxml
337
+ ┃ ┃ ┣ pjs{SongID}.txt
338
+ ┃ ┣ ...
339
+ ```
340
+
341
+ ## SVCC
342
+
343
+ The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:
344
+
345
+ ```plaintext
346
+ [SVCC dataset path]
347
+ ┣ Data
348
+ ┃ ┣ CDF1
349
+ ┃ ┃ ┣ 10001.wav
350
+ ┃ ┃ ┣ 10002.wav
351
+ ┃ ┃ ┣ ...
352
+ ┃ ┣ CDM1
353
+ ┃ ┣ IDF1
354
+ ┃ ┣ IDM1
355
+ ┗ README.md
356
+ ```
357
+
358
+ ## VCTK
359
+
360
+ The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:
361
+
362
+ ```plaintext
363
+ [VCTK dataset path]
364
+ ┣ txt
365
+ ┃ ┣ {Speaker_1}
366
+ ┃ ┃ ┣ {Speaker_1}_001.txt
367
+ ┃ ┃ ┣ {Speaker_1}_002.txt
368
+ ┃ ┃ ┣ ...
369
+ ┃ ┣ {Speaker_2}
370
+ ┃ ┣ ...
371
+ ┣ wav48_silence_trimmed
372
+ ┃ ┣ {Speaker_1}
373
+ ┃ ┃ ┣ {Speaker_1}_001_mic1.flac
374
+ ┃ ┃ ┣ {Speaker_1}_001_mic2.flac
375
+ ┃ ┃ ┣ {Speaker_1}_002_mic1.flac
376
+ ┃ ┃ ┣ ...
377
+ ┃ ┣ {Speaker_2}
378
+ ┃ ┣ ...
379
+ ┣ speaker-info.txt
380
+ ┗ update.txt
381
+ ```
egs/metrics/README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Evaluation Recipe
2
+
3
+ ## Supported Evaluation Metrics
4
+
5
+ Currently, Amphion Evaluation supports the following objective metrics:
6
+
7
+ - **F0 Modeling**:
8
+ - F0 Pearson Coefficients (FPC)
9
+ - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
10
+ - F0 Root Mean Square Error (F0RMSE)
11
+ - Voiced/Unvoiced F1 Score (V/UV F1)
12
+ - **Energy Modeling**:
13
+ - Energy Root Mean Square Error (EnergyRMSE)
14
+ - Energy Pearson Coefficients (EnergyPC)
15
+ - **Intelligibility**:
16
+ - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
17
+ - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
18
+ - **Spectrogram Distortion**:
19
+ - Frechet Audio Distance (FAD)
20
+ - Mel Cepstral Distortion (MCD)
21
+ - Multi-Resolution STFT Distance (MSTFT)
22
+ - Perceptual Evaluation of Speech Quality (PESQ)
23
+ - Short Time Objective Intelligibility (STOI)
24
+ - Scale Invariant Signal to Distortion Ratio (SISDR)
25
+ - Scale Invariant Signal to Noise Ratio (SISNR)
26
+ - **Speaker Similarity**:
27
+ - Cosine similarity based on [Rawnet3](https://github.com/Jungjee/RawNet)
28
+ - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 developing)
29
+
30
+ We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
31
+
32
+ 1. Pretrained Models Preparation
33
+ 2. Audio Data Preparation
34
+ 3. Evaluation
35
+
36
+ ## 1. Pretrained Models Preparation
37
+
38
+ If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
39
+
40
+ ## 2. Audio Data Preparation
41
+
42
+ Prepare reference audios and generated audios in two folders, the `ref_dir` contains the reference audio and the `gen_dir` contains the generated audio. Here is an example.
43
+
44
+ ```plaintext
45
+ ┣ {ref_dir}
46
+ ┃ ┣ sample1.wav
47
+ ┃ ┣ sample2.wav
48
+ ┣ {gen_dir}
49
+ ┃ ┣ sample1.wav
50
+ ┃ ┣ sample2.wav
51
+ ```
52
+
53
+ You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
54
+
55
+ ## 3. Evaluation
56
+
57
+ Run `run.sh` with the specified reference folder, generated folder, dump folder, and metrics.
58
+
59
+ ```bash
60
+ cd Amphion
61
+ sh egs/metrics/run.sh \
62
+ --reference_folder [Your path to the reference audios] \
63
+ --generated_folder [Your path to the generated audios] \
64
+ --dump_folder [Your path to dump the objective results] \
65
+ --metrics [The metrics you need] \
66
+ ```
67
+
68
+ As for the metrics, an example is provided below:
69
+
70
+ ```bash
71
+ --metrics "mcd pesq fad"
72
+ ```
73
+
74
+ All currently available metrics keywords are listed below:
75
+
76
+ | Keys | Description |
77
+ | --------------------- | ------------------------------------------ |
78
+ | `fpc` | F0 Pearson Coefficients |
79
+ | `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
80
+ | `f0rmse` | F0 Root Mean Square Error |
81
+ | `v_uv_f1` | Voiced/Unvoiced F1 Score |
82
+ | `energy_rmse` | Energy Root Mean Square Error |
83
+ | `energy_pc` | Energy Pearson Coefficients |
84
+ | `cer` | Character Error Rate |
85
+ | `wer` | Word Error Rate |
86
+ | `speaker_similarity` | Cos Similarity based on RawNet3 |
87
+ | `fad` | Frechet Audio Distance |
88
+ | `mcd` | Mel Cepstral Distortion |
89
+ | `mstft` | Multi-Resolution STFT Distance |
90
+ | `pesq` | Perceptual Evaluation of Speech Quality |
91
+ | `si_sdr` | Scale Invariant Signal to Distortion Ratio |
92
+ | `si_snr` | Scale Invariant Signal to Noise Ratio |
93
+ | `stoi` | Short Time Objective Intelligibility |
egs/metrics/run.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $exp_dir))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Reference Audio Folder
21
+ --reference_folder) shift; ref_dir=$1 ; shift ;;
22
+ # Generated Audio Folder
23
+ --generated_folder) shift; deg_dir=$1 ; shift ;;
24
+ # Result Dumping Folder
25
+ --dump_folder) shift; dump_dir=$1 ; shift ;;
26
+ # Metrics to Compute
27
+ --metrics) shift; metrics=$1 ; shift ;;
28
+
29
+ --) shift ; break ;;
30
+ *) echo "Invalid option: $1" exit 1 ;;
31
+ esac
32
+ done
33
+
34
+ ######## Calculate Objective Metrics ###########
35
+ CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
36
+ --ref_dir $ref_dir
37
+ --deg_dir $deg_dir
38
+ --dump_dir $dump_dir
39
+ --metrics $metrics
40
+ --fs
egs/svc/DiffComoSVC/README.md ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
2
+ <br>
3
+ <div align="center">
4
+ <img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
5
+ </div>
6
+ <br>
7
+
8
+ This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio), with only a slight modification applied to the acoustic model. Specifically,
9
+
10
+ * The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on Bidirectional Non-Causal Dilated CNN, which polishes the coarse spectrogram for better quality. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
11
+ * To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.
12
+
13
+ There are five stages in total:
14
+
15
+ 1. Data preparation
16
+ 2. Features extraction
17
+ 3. Teacher Model Training
18
+ 4. Consistency Distillation
19
+ 5. Inference/conversion
20
+
21
+ ## 1. Data Preparation
22
+
23
+ ### Dataset Download
24
+
25
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
26
+
27
+ ### Configuration
28
+
29
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
30
+
31
+ ```json
32
+ "dataset": [
33
+ "m4singer",
34
+ "opencpop",
35
+ "opensinger",
36
+ "svcc",
37
+ "vctk"
38
+ ],
39
+ "dataset_path": {
40
+ // TODO: Fill in your dataset path
41
+ "m4singer": "[M4Singer dataset path]",
42
+ "opencpop": "[Opencpop dataset path]",
43
+ "opensinger": "[OpenSinger dataset path]",
44
+ "svcc": "[SVCC dataset path]",
45
+ "vctk": "[VCTK dataset path]"
46
+ },
47
+ ```
48
+
49
+ ## 2. Features Extraction
50
+
51
+ ### Content-based Pretrained Models Download
52
+
53
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
54
+
55
+ ### Configuration
56
+
57
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
58
+
59
+ ```json
60
+ // TODO: Fill in the output log path
61
+ "log_dir": "[Your path to save logs and checkpoints]",
62
+ "preprocess": {
63
+ // TODO: Fill in the output data path
64
+ "processed_dir": "[Your path to save processed data]",
65
+ ...
66
+ },
67
+ ```
68
+
69
+ ### Run
70
+
71
+ Run `run.sh` for the preprocessing stage (set `--stage 1`).
72
+
73
+ ```bash
74
+ cd Amphion
75
+ sh egs/svc/DiffComoSVC/run.sh --stage 1
76
+ ```
77
+
78
+ Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
79
+
80
+ ## 3. Teacher Model Training
81
+
82
+ ### Configuration
83
+
84
+ Set the `distill` in `config/comosvc.json` to `false` for teacher model training, you can also specify the detailed configuration for conformer encoder and diffusion process here:
85
+
86
+ ```JSON
87
+ "comosvc":{
88
+ "distill": false,
89
+ // conformer encoder
90
+ "input_dim": 384,
91
+ "output_dim": 100,
92
+ "n_heads": 2,
93
+ "n_layers": 6,
94
+ "filter_channels":512,
95
+ // karras diffusion
96
+ "P_mean": -1.2,
97
+ "P_std": 1.2,
98
+ "sigma_data": 0.5,
99
+ "sigma_min": 0.002,
100
+ "sigma_max": 80,
101
+ "rho": 7,
102
+ "n_timesteps": 40,
103
+ },
104
+ ```
105
+
106
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
107
+
108
+ ```json
109
+ "train": {
110
+ "batch_size": 32,
111
+ ...
112
+ "adamw": {
113
+ "lr": 2.0e-4
114
+ },
115
+ ...
116
+ }
117
+ ```
118
+
119
+ ### Run
120
+
121
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
122
+
123
+ ```bash
124
+ cd Amphion
125
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
126
+ ```
127
+
128
+ Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can specify it when running `run.sh` such as:
129
+
130
+ ```bash
131
+ cd Amphion
132
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
133
+ ```
134
+
135
+ ## 4. Consistency Distillation
136
+
137
+ ### Configuration
138
+
139
+ Set the `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path` of the trained teacher model. You can also specify the detailed configuration for conformer encoder and diffusion process here:
140
+
141
+ ```JSON
142
+ "model": {
143
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
144
+ ...
145
+ "comosvc":{
146
+ "distill": true,
147
+ // conformer encoder
148
+ "input_dim": 384,
149
+ "output_dim": 100,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "filter_channels":512,
153
+ // karras diffusion
154
+ "P_mean": -1.2,
155
+ "P_std": 1.2,
156
+ "sigma_data": 0.5,
157
+ "sigma_min": 0.002,
158
+ "sigma_max": 80,
159
+ "rho": 7,
160
+ "n_timesteps": 40,
161
+ },
162
+ ```
163
+
164
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
165
+
166
+ ```json
167
+ "train": {
168
+ "batch_size": 32,
169
+ ...
170
+ "adamw": {
171
+ "lr": 2.0e-4
172
+ },
173
+ ...
174
+ }
175
+ ```
176
+
177
+ ### Run
178
+
179
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
180
+
181
+ ```bash
182
+ cd Amphion
183
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
184
+ ```
185
+
186
+ Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can specify it when running `run.sh` such as:
187
+
188
+ ```bash
189
+ cd Amphion
190
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
191
+ ```
192
+
193
+ ## 5. Inference/Conversion
194
+
195
+ ### Pretrained Vocoder Download
196
+
197
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
198
+
199
+ ### Run
200
+
201
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
202
+
203
+ | Parameters | Description | Example |
204
+ | --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
205
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
206
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
207
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
208
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
209
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
210
+
211
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
212
+
213
+ ```bash
214
+ cd Amphion
215
+ sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
216
+ --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
217
+ --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
218
+ --infer_source_audio_dir [Your Audios Folder] \
219
+ --infer_target_speaker "opencpop_female1" \
220
+ --infer_key_shift "autoshift"
221
+ ```
222
+ Specifically, you can configure the inference steps for the teacher model by setting `inference` in `exp_config.json` (the student model always uses one-step sampling):
223
+ ```json
224
+ "inference": {
225
+ "comosvc": {
226
+ "inference_steps": 40
227
+ }
228
+ }
229
+ ```
230
+
231
+ # Reference
232
+ https://github.com/zhenye234/CoMoSpeech
233
+
234
+ https://github.com/openai/consistency_models
egs/svc/DiffComoSVC/exp_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/comosvc.json",
3
+ "model_type": "DiffComoSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path
20
+ "log_dir": "[Your path to save logs and checkpoints]",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path
23
+ "processed_dir": "[Your path to save processed data]",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
53
+ "condition_encoder": {
54
+ // Config for features usage
55
+ "use_whisper": true,
56
+ "use_contentvec": true,
57
+ "use_wenet": false,
58
+ "whisper_dim": 1024,
59
+ "contentvec_dim": 256,
60
+ "wenet_dim": 512,
61
+ "use_singer_encoder": false,
62
+ "pitch_min": 50,
63
+ "pitch_max": 1100
64
+ },
65
+ "comosvc":{
66
+ "distill": false,
67
+ // conformer encoder
68
+ "input_dim": 384,
69
+ "output_dim": 100,
70
+ "n_heads": 2,
71
+ "n_layers": 6,
72
+ "filter_channels":512,
73
+ "dropout":0.1,
74
+ // karras diffusion
75
+ "P_mean": -1.2,
76
+ "P_std": 1.2,
77
+ "sigma_data": 0.5,
78
+ "sigma_min": 0.002,
79
+ "sigma_max": 80,
80
+ "rho": 7,
81
+ "n_timesteps": 40,
82
+ },
83
+ "diffusion": {
84
+ // Diffusion steps encoder
85
+ "step_encoder": {
86
+ "dim_raw_embedding": 128,
87
+ "dim_hidden_layer": 512,
88
+ "activation": "SiLU",
89
+ "num_layer": 2,
90
+ "max_period": 10000
91
+ },
92
+ // Diffusion decoder
93
+ "model_type": "bidilconv",
94
+ // bidilconv, unet2d, TODO: unet1d
95
+ "bidilconv": {
96
+ "base_channel": 384,
97
+ "n_res_block": 20,
98
+ "conv_kernel_size": 3,
99
+ "dilation_cycle_length": 4,
100
+ // specially, 1 means no dilation
101
+ "conditioner_size": 100
102
+ }
103
+ }
104
+ },
105
+ "train": {
106
+ "batch_size": 64,
107
+ "gradient_accumulation_step": 1,
108
+ "max_epoch": -1, // -1 means no limit
109
+ "save_checkpoint_stride": [
110
+ 50,
111
+ 50
112
+ ],
113
+ "keep_last": [
114
+ 5,
115
+ -1
116
+ ],
117
+ "run_eval": [
118
+ false,
119
+ true
120
+ ],
121
+ "adamw": {
122
+ "lr": 4.0e-4
123
+ },
124
+ "reducelronplateau": {
125
+ "factor": 0.8,
126
+ "patience": 10,
127
+ "min_lr": 1.0e-4
128
+ },
129
+ "dataloader": {
130
+ "num_worker": 8,
131
+ "pin_memory": true
132
+ },
133
+ "sampler": {
134
+ "holistic_shuffle": false,
135
+ "drop_last": true
136
+ }
137
+ },
138
+ "inference": {
139
+ "comosvc": {
140
+ "inference_steps": 40
141
+ }
142
+ }
143
+ }
egs/svc/DiffComoSVC/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
######## Build Experiment Environment ###########
# Resolve the directory holding this script and the repository root, which is
# three levels above it (egs/svc/DiffComoSVC -> egs/svc -> egs -> root).
exp_dir=$(cd "$(dirname "$0")" && pwd)
work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")

export WORK_DIR="$work_dir"
export PYTHONPATH="$work_dir"
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
# `s:` (with the colon) is required so that `-s <stage>` takes a value, in line
# with the `-s | --stage` arm below. Abort if getopt rejects the command line.
options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") || exit 1
eval set -- "$options"

while true; do
    case $1 in
        # Experimental Configuration File
        -c | --config) shift; exp_config=$1 ; shift ;;
        # Experimental Name
        -n | --name) shift; exp_name=$1 ; shift ;;
        # Running Stage
        -s | --stage) shift; running_stage=$1 ; shift ;;
        # Visible GPU machines. The default value is "0".
        --gpu) shift; gpu=$1 ; shift ;;

        # [Only for Training] Resume configuration
        --resume) shift; resume=$1 ; shift ;;
        # [Only for Training] The specific checkpoint path that you want to resume from.
        --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
        # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
        --resume_type) shift; resume_type=$1 ; shift ;;

        # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
        --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
        # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
        --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
        # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
        --infer_source_file) shift; infer_source_file=$1 ; shift ;;
        --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
        # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
        --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
        # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
        --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
        # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
        --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

        --) shift ; break ;;
        # The `;` before `exit` matters: without it, "exit 1" is merely echoed
        # and the loop never terminates.
        *) echo "Invalid option: $1"; exit 1 ;;
    esac
done


### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Exprimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ "$running_stage" -eq 1 ]; then
    CUDA_VISIBLE_DEVICES="$gpu" python "${work_dir}"/bins/svc/preprocess.py \
        --config "$exp_config" \
        --num_workers 4
fi

######## Training ###########
if [ "$running_stage" -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiments name"
        exit 1
    fi
    echo "Exprimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        # The checkpoint path and resume type are forwarded even when empty,
        # exactly as before; train.py decides what an empty value means.
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume_from_ckpt_path "$resume_from_ckpt_path" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ "$running_stage" -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    # Default to a "result" folder inside the experiment dir. The previous
    # code read the never-defined $expt_dir, which produced "/result".
    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
        echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be \"[Your path to save processed data]/[YourDataset]/test.json\", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
        exit 1
    fi

    # Prefer the json file when given, otherwise use the audio dir. This also
    # covers the case where both are supplied, which used to leave
    # $infer_source unset.
    if [ -n "$infer_source_file" ]; then
        infer_source=$infer_source_file
    else
        infer_source=$infer_source_audio_dir
    fi

    if [ -z "$infer_target_speaker" ]; then
        echo "[Error] Please specify the target speaker. You can refer to \"[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json\". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be \"opencpop_female1\""
        exit 1
    fi

    if [ -z "$infer_key_shift" ]; then
        infer_key_shift="autoshift"
    fi

    if [ -z "$infer_vocoder_dir" ]; then
        infer_vocoder_dir="$work_dir"/pretrained/bigvgan
        echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
    fi

    # All values are quoted so that paths containing spaces survive intact.
    CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "$work_dir"/bins/svc/inference.py \
        --config "$exp_config" \
        --acoustics_dir "$infer_expt_dir" \
        --vocoder_dir "$infer_vocoder_dir" \
        --target_singer "$infer_target_speaker" \
        --trans_key "$infer_key_shift" \
        --source "$infer_source" \
        --output_dir "$infer_output_dir" \
        --log_level debug
fi
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
13
+
14
+ - The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
+ - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
16
+ - The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
17
+
18
+ There are four stages in total:
19
+
20
+ 1. Data preparation
21
+ 2. Features extraction
22
+ 3. Training
23
+ 4. Inference/conversion
24
+
25
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
+ > ```bash
27
+ > cd Amphion
28
+ > ```
29
+
30
+ ## 1. Data Preparation
31
+
32
+ ### Dataset Download
33
+
34
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
+
36
+ ### Configuration
37
+
38
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
+
40
+ ```json
41
+ "dataset": [
42
+ "m4singer",
43
+ "opencpop",
44
+ "opensinger",
45
+ "svcc",
46
+ "vctk"
47
+ ],
48
+ "dataset_path": {
49
+ // TODO: Fill in your dataset path
50
+ "m4singer": "[M4Singer dataset path]",
51
+ "opencpop": "[Opencpop dataset path]",
52
+ "opensinger": "[OpenSinger dataset path]",
53
+ "svcc": "[SVCC dataset path]",
54
+ "vctk": "[VCTK dataset path]"
55
+ },
56
+ ```
57
+
58
+ ## 2. Features Extraction
59
+
60
+ ### Content-based Pretrained Models Download
61
+
62
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
+
64
+ ### Configuration
65
+
66
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
+
68
+ ```json
69
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
+ "log_dir": "ckpts/svc",
71
+ "preprocess": {
72
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
73
+ "processed_dir": "data",
74
+ ...
75
+ },
76
+ ```
77
+
78
+ ### Run
79
+
80
+ Run `run.sh` for the preprocessing stage (set `--stage 1`).
81
+
82
+ ```bash
83
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
+ ```
85
+
86
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
87
+
88
+ ## 3. Training
89
+
90
+ ### Configuration
91
+
92
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
93
+
94
+ ```json
95
+ "train": {
96
+ "batch_size": 32,
97
+ ...
98
+ "adamw": {
99
+ "lr": 2.0e-4
100
+ },
101
+ ...
102
+ }
103
+ ```
104
+
105
+ ### Run
106
+
107
+ Run `run.sh` for the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
+
109
+ ```bash
110
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
+ ```
112
+
113
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
114
+
115
+ ## 4. Inference/Conversion
116
+
117
+ ### Pretrained Vocoder Download
118
+
119
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
+
121
+ ### Run
122
+
123
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
+
125
+ | Parameters | Description | Example |
126
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
+
133
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
+
135
+ ```bash
136
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
+ --infer_source_audio_dir [Your Audios Folder] \
140
+ --infer_target_speaker "opencpop_female1" \
141
+ --infer_key_shift "autoshift"
142
+ ```
143
+
144
+ ## Citations
145
+
146
+ ```bibtex
147
+ @article{zhang2023leveraging,
148
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
+ year={2023}
152
+ }
153
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "diffusion": {
65
+ "scheduler": "ddpm",
66
+ "scheduler_settings": {
67
+ "num_train_timesteps": 1000,
68
+ "beta_start": 1.0e-4,
69
+ "beta_end": 0.02,
70
+ "beta_schedule": "linear"
71
+ },
72
+ // Diffusion steps encoder
73
+ "step_encoder": {
74
+ "dim_raw_embedding": 128,
75
+ "dim_hidden_layer": 512,
76
+ "activation": "SiLU",
77
+ "num_layer": 2,
78
+ "max_period": 10000
79
+ },
80
+ // Diffusion decoder
81
+ "model_type": "bidilconv",
82
+ // bidilconv, unet2d, TODO: unet1d
83
+ "bidilconv": {
84
+ "base_channel": 512,
85
+ "n_res_block": 40,
86
+ "conv_kernel_size": 3,
87
+ "dilation_cycle_length": 4,
88
+ // specially, 1 means no dilation
89
+ "conditioner_size": 384
90
+ }
91
+ }
92
+ },
93
+ "train": {
94
+ "batch_size": 32,
95
+ "gradient_accumulation_step": 1,
96
+ "max_epoch": -1, // -1 means no limit
97
+ "save_checkpoint_stride": [
98
+ 3,
99
+ 50
100
+ ],
101
+ "keep_last": [
102
+ 3,
103
+ 2
104
+ ],
105
+ "run_eval": [
106
+ true,
107
+ true
108
+ ],
109
+ "adamw": {
110
+ "lr": 2.0e-4
111
+ },
112
+ "reducelronplateau": {
113
+ "factor": 0.8,
114
+ "patience": 30,
115
+ "min_lr": 1.0e-4
116
+ },
117
+ "dataloader": {
118
+ "num_worker": 8,
119
+ "pin_memory": true
120
+ },
121
+ "sampler": {
122
+ "holistic_shuffle": false,
123
+ "drop_last": true
124
+ }
125
+ }
126
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") # every short option takes a value, so -s needs its trailing colon too
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;; # abort on an unknown option instead of looping forever
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Exprimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Exprimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result" # default: "result" folder inside the experiment dir given by --infer_expt_dir
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
17
+ Until now, Amphion SVC has supported the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/TransformerSVC/README.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transformer for Singing Voice Conversion
2
+
3
+ This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preprocess stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/TransformerSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+ Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported:
79
+ ```json
80
+ "model": {
81
+ ...
82
+ "transformer":{
83
+ // 'conformer' or 'transformer'
84
+ "type": "conformer",
85
+ "input_dim": 384,
86
+ "output_dim": 100,
87
+ "n_heads": 2,
88
+ "n_layers": 6,
89
+ "filter_channels":512,
90
+ "dropout":0.1,
91
+ }
92
+ }
93
+ ```
94
+ We provide the default hyperparameters in the `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
95
+
96
+ ```json
97
+ "train": {
98
+ "batch_size": 32,
99
+ ...
100
+ "adamw": {
101
+ "lr": 2.0e-4
102
+ },
103
+ ...
104
+ }
105
+ ```
106
+
107
+ ### Run
108
+
109
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
110
+
111
+ ```bash
112
+ sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
113
+ ```
114
+
115
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
116
+
117
+ ## 4. Inference/Conversion
118
+
119
+ ### Pretrained Vocoder Download
120
+
121
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
122
+
123
+ ### Run
124
+
125
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
126
+
127
+ | Parameters | Description | Example |
128
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
129
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
130
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
131
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
132
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
133
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
134
+
135
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
136
+
137
+ ```bash
138
+ cd Amphion
139
+ sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
140
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
141
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
142
+ --infer_source_audio_dir [Your Audios Folder] \
143
+ --infer_target_speaker "opencpop_female1" \
144
+ --infer_key_shift "autoshift"
145
+ ```
146
+
147
+ ## Citations
148
+
149
+ ```bibtex
150
+ @inproceedings{transformer,
151
+ author = {Ashish Vaswani and
152
+ Noam Shazeer and
153
+ Niki Parmar and
154
+ Jakob Uszkoreit and
155
+ Llion Jones and
156
+ Aidan N. Gomez and
157
+ Lukasz Kaiser and
158
+ Illia Polosukhin},
159
+ title = {Attention is All you Need},
160
+ booktitle = {{NIPS}},
161
+ pages = {5998--6008},
162
+ year = {2017}
163
+ }
164
+ ```
egs/svc/TransformerSVC/exp_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/transformer.json",
3
+ "model_type": "TransformerSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "transformer": {
65
+ // 'conformer' or 'transformer'
66
+ "type": "conformer",
67
+ "input_dim": 384,
68
+ "output_dim": 100,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "filter_channels": 512,
72
+ "dropout": 0.1,
73
+ }
74
+ },
75
+ "train": {
76
+ "batch_size": 64,
77
+ "gradient_accumulation_step": 1,
78
+ "max_epoch": -1, // -1 means no limit
79
+ "save_checkpoint_stride": [
80
+ 50,
81
+ 50
82
+ ],
83
+ "keep_last": [
84
+ 5,
85
+ -1
86
+ ],
87
+ "run_eval": [
88
+ false,
89
+ true
90
+ ],
91
+ "adamw": {
92
+ "lr": 4.0e-4
93
+ },
94
+ "reducelronplateau": {
95
+ "factor": 0.8,
96
+ "patience": 10,
97
+ "min_lr": 1.0e-4
98
+ },
99
+ "dataloader": {
100
+ "num_worker": 8,
101
+ "pin_memory": true
102
+ },
103
+ "sampler": {
104
+ "holistic_shuffle": false,
105
+ "drop_last": true
106
+ }
107
+ }
108
+ }
egs/svc/TransformerSVC/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") # every short option takes a value, so -s needs its trailing colon too
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;; # abort on an unknown option instead of looping forever
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Exprimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Exprimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result" # default: "result" folder inside the experiment dir given by --infer_expt_dir
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") # every short option takes a value, so -s needs its trailing colon too
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;; # abort on an unknown option instead of looping forever
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Exprimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Exprimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result" # default: "result" folder inside the experiment dir given by --infer_expt_dir
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/tta/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion Text-to-Audio (TTA) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting-edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ Currently, Amphion supports one latent-diffusion-based text-to-audio model:
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
14
+ </div>
15
+ <br>
16
+
17
+ Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training:
18
+ 1. Training the VAE which is called `AutoencoderKL` in Amphion.
19
+ 2. Training the conditional latent diffusion model which is called `AudioLDM` in Amphion.
egs/tta/RECIPE.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text-to-Audio with Latent Diffusion Model
2
+
3
+ This is a quick tour of training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.
4
+
5
+ <br>
6
+ <div align="center">
7
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
8
+ </div>
9
+ <br>
10
+
11
+ We train this latent diffusion model in two stages:
12
+ 1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project
13
+ the input mel-spectrograms to an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality.
14
+ 2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model with a T5 encoder as the text encoder.
15
+
16
+ The complete workflow for training the text-to-audio model consists of four steps:
17
+
18
+ 1. Data preparation and processing
19
+ 2. Train the VAE model
20
+ 3. Train the latent diffusion model
21
+ 4. Inference
22
+
23
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
24
+ > ```bash
25
+ > cd Amphion
26
+ > ```
27
+
28
+ ## Overview
29
+
30
+ ```sh
31
+ # Train the VAE model
32
+ sh egs/tta/autoencoderkl/run_train.sh
33
+
34
+ # Train the latent diffusion model
35
+ sh egs/tta/audioldm/run_train.sh
36
+
37
+ # Inference
38
+ sh egs/tta/audioldm/run_inference.sh
39
+ ```
40
+
41
+ ## 1. Data preparation and processing
42
+
43
+ ### Dataset Download
44
+
45
+ We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).
46
+
47
+ <!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
48
+ <!-- You can download the dataset [here](https://github.com/cdjkim/audiocaps). -->
49
+
50
+ ### Data Processing
51
+
52
+ - Download AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.
53
+
54
+ ```json
55
+ {
56
+ "dataset": [
57
+ "AudioCaps"
58
+ ],
59
+ "preprocess": {
60
+ // Specify the output root path to save the processed data
61
+ "processed_dir": "[Your path to save tta dataset]",
62
+ ...
63
+ }
64
+ }
65
+ ```
66
+
67
+ The folder structure of your downloaded data should be similar to:
68
+
69
+ ```plaintext
70
+ .../[Your path to save tta dataset]
71
+ ┣ AudioCaps
72
+ ┃   ┣ wav
73
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
74
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
75
+ ┃ ┃ ┣ ...
76
+ ```
77
+
78
+ - Then you may convert the data to mel-spectrograms and save them in `.npy` format. If you use the data we provide, all the wav data has already been processed.
79
+
80
+ - Generate a JSON file to store the metadata. The JSON file looks like:
81
+
82
+ ```json
83
+ [
84
+ {
85
+ "Dataset": "AudioCaps",
86
+ "Uid": "---1_cCGK4M_0_10000",
87
+ "Caption": "Idling car, train blows horn and passes"
88
+ },
89
+ {
90
+ "Dataset": "AudioCaps",
91
+ "Uid": "---lTs1dxhU_30000_40000",
92
+ "Caption": "A racing vehicle engine is heard passing by"
93
+ },
94
+ ...
95
+ ]
96
+ ```
97
+ - Finally, the folder structure is like:
98
+
99
+ ```plaintext
100
+ .../[Your path to save tta dataset]
101
+ ┣ AudioCaps
102
+ ┃   ┣ wav
103
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
104
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
105
+ ┃ ┃ ┣ ...
106
+ ┃   ┣ mel
107
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
108
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
109
+ ┃ ┃ ┣ ...
110
+ ┃   ┣ train.json
111
+ ┃   ┣ valid.json
112
+ ┃   ┣ ...
113
+ ```
114
+
115
+ ## 2. Training the VAE Model
116
+
117
+ The first-stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following command:
118
+
119
+ ```sh
120
+ sh egs/tta/autoencoderkl/run_train.sh
121
+ ```
122
+
123
+ ## 3. Training the Latent Diffusion Model
124
+
125
+ The second-stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following command:
126
+
127
+ ```sh
128
+ sh egs/tta/audioldm/run_train.sh
129
+ ```
130
+
131
+ ## 4. Inference
132
+
133
+ Now you can generate audio with your trained latent diffusion model. Run the following command, modifying the `text` argument as needed.
134
+
135
+ ```sh
136
+ sh egs/tta/audioldm/run_inference.sh \
137
+ --text "A man is whistling"
138
+ ```
139
+
140
+ ## Citations
141
+
142
+ ```bibtex
143
+ @article{wang2023audit,
144
+ title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
145
+ author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
146
+ journal={NeurIPS 2023},
147
+ year={2023}
148
+ }
149
+
150
+ @article{liu2023audioldm,
151
+ title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
152
+ author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
153
+ journal={Proceedings of the International Conference on Machine Learning},
154
+ year={2023}
155
+ }
156
+ ```
egs/tta/audioldm/exp_config.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+ // For example: "/home/TTADataset/processed_data"
10
+
11
+ // feature
12
+ "use_spkid": false,
13
+ "use_uv": false,
14
+ "use_frame_pitch": false,
15
+ "use_phone_pitch": false,
16
+ "use_frame_energy": false,
17
+ "use_phone_energy": false,
18
+ "use_mel": false,
19
+ "use_audio": false,
20
+ "use_label": false,
21
+ "use_one_hot": false,
22
+ // feature for text to audio
23
+ "use_caption": true,
24
+ "use_melspec": true,
25
+ "use_wav": false,
26
+ // feature dir
27
+ "melspec_dir": "mel",
28
+ "wav_dir": "wav"
29
+ },
30
+ // Specify the output root path to save model ckpts and logs
31
+ "log_dir": "ckpts/tta",
32
+ // For example: "/home/TTADataset/processed_data/logs"
33
+
34
+ // model
35
+ "model": {
36
+ "audioldm": {
37
+ "image_size": 32,
38
+ "in_channels": 4,
39
+ "out_channels": 4,
40
+ "model_channels": 256,
41
+ "attention_resolutions": [4, 2, 1],
42
+ "num_res_blocks": 2,
43
+ "channel_mult": [1, 2, 4],
44
+ "num_heads": 8,
45
+ "use_spatial_transformer": true,
46
+ "transformer_depth": 1,
47
+ "context_dim": 768,
48
+ "use_checkpoint": true,
49
+ "legacy": false
50
+ },
51
+ "autoencoderkl": {
52
+ "ch": 128,
53
+ "ch_mult": [1,1,2,2,4],
54
+ "num_res_blocks": 2,
55
+ "in_channels": 1,
56
+ "z_channels": 4,
57
+ "out_ch": 1,
58
+ "double_z": true
59
+ },
60
+ "noise_scheduler": {
61
+ "num_train_timesteps": 1000,
62
+ "beta_start": 0.00085,
63
+ "beta_end": 0.012,
64
+ "beta_schedule": "scaled_linear",
65
+ "clip_sample": false,
66
+ "steps_offset": 1,
67
+ "set_alpha_to_one": false,
68
+ "skip_prk_steps": true,
69
+ "prediction_type": "epsilon"
70
+ },
71
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
72
+ },
73
+
74
+ // train
75
+ "train": {
76
+ "adam": {
77
+ "lr": 5.0e-5
78
+ },
79
+ "ddp": false,
80
+ "random_seed": 12345,
81
+ "batch_size": 12,
82
+ "epochs": 50000,
83
+ "max_steps": 1000000,
84
+ "total_training_steps": 800000,
85
+ "save_summary_steps": 1000,
86
+ "save_checkpoints_steps": 5000,
87
+ "valid_interval": 5000,
88
+ "keep_checkpoint_max": 100
89
+ }
90
+ }
egs/tta/audioldm/exp_config_base.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/audioldm.json",
3
+ "model_type": "AudioLDM",
4
+ "dataset": [
5
+ "AudioCaps"
6
+ ],
7
+ "preprocess": {
8
+ "train_file": "train.json",
9
+ "valid_file": "valid.json"
10
+ }
11
+ }
egs/tta/audioldm/exp_config_latent_4_10_78.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spkid": false,
12
+ "use_uv": false,
13
+ "use_frame_pitch": false,
14
+ "use_phone_pitch": false,
15
+ "use_frame_energy": false,
16
+ "use_phone_energy": false,
17
+ "use_mel": false,
18
+ "use_audio": false,
19
+ "use_label": false,
20
+ "use_one_hot": false,
21
+ // feature for text to audio
22
+ "use_caption": true,
23
+ "use_melspec": true,
24
+ "use_wav": false,
25
+ // feature dir
26
+ "melspec_dir": "mel",
27
+ "wav_dir": "wav"
28
+ },
29
+ // Specify the output root path to save model ckpts and logs
30
+ "log_dir": "ckpts/tta",
31
+
32
+ // model
33
+ "model": {
34
+ "audioldm": {
35
+ "image_size": 32,
36
+ "in_channels": 4,
37
+ "out_channels": 4,
38
+ "model_channels": 256,
39
+ "attention_resolutions": [4, 2, 1],
40
+ "num_res_blocks": 2,
41
+ "channel_mult": [1, 2, 4],
42
+ "num_heads": 8,
43
+ "use_spatial_transformer": true,
44
+ "transformer_depth": 1,
45
+ "context_dim": 768,
46
+ "use_checkpoint": true,
47
+ "legacy": false
48
+ },
49
+ "autoencoderkl": {
50
+ "ch": 128,
51
+ "ch_mult": [1,2,2,4],
52
+ "num_res_blocks": 2,
53
+ "in_channels": 1,
54
+ "z_channels": 4,
55
+ "out_ch": 1,
56
+ "double_z": true
57
+ },
58
+ "noise_scheduler": {
59
+ "num_train_timesteps": 1000,
60
+ "beta_start": 0.00085,
61
+ "beta_end": 0.012,
62
+ "beta_schedule": "scaled_linear",
63
+ "clip_sample": false,
64
+ "steps_offset": 1,
65
+ "set_alpha_to_one": false,
66
+ "skip_prk_steps": true,
67
+ "prediction_type": "epsilon"
68
+ },
69
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
70
+ },
71
+
72
+ // train
73
+ "train": {
74
+ "adam": {
75
+ "lr": 2.0e-5
76
+ },
77
+ "ddp": false,
78
+ "random_seed": 12345,
79
+ "batch_size": 12,
80
+ "epochs": 50000,
81
+ "max_steps": 1000000,
82
+ "total_training_steps": 800000,
83
+ "save_summary_steps": 1000,
84
+ "save_checkpoints_steps": 5000,
85
+ "valid_interval": 5000,
86
+ "keep_checkpoint_max": 100
87
+ }
88
+ }
egs/tta/audioldm/run_inference.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd "$(dirname "$0")" && pwd)  # absolute path of the directory holding this script
8
+ work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")  # three levels up from exp_dir
9
+
10
+ export WORK_DIR="$work_dir"
11
+ export PYTHONPATH="$work_dir"
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"  # NOTE: not passed to inference.py below
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [ $# -gt 0 ]  # POSIX test (not bash-only [[ ]]) so `sh run_inference.sh` works too
28
+ do
29
+     key="$1"
30
+
31
+     case $key in
32
+         --text)
33
+             text="$2"  # caption forwarded to inference.py via --text
34
+             shift # past argument
35
+             shift # past value
36
+             ;;
37
+         *) # unknown option: silently skipped
38
+             shift # past argument
39
+             ;;
40
+     esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+     --config="$exp_config" \
46
+     --checkpoint_path="$checkpoint_path" \
47
+     --text="$text" \
48
+     --vocoder_path="$vocoder_path" \
49
+     --vocoder_config_path="$vocoder_config_path" \
50
+     --num_steps="$num_steps" \
51
+     --guidance_scale="$guidance_scale" \
52
+     --output_dir="$output_dir"
egs/tta/audioldm/run_inference_latent_4_10_78.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd "$(dirname "$0")" && pwd)  # absolute path of the directory holding this script
8
+ work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")  # three levels up from exp_dir
9
+
10
+ export WORK_DIR="$work_dir"
11
+ export PYTHONPATH="$work_dir"
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"  # config shipped alongside this script for the 4x10x78 latent variant
16
+ exp_name="audioldm_debug_latent_size_4_10_78"  # NOTE: not passed to inference.py below
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [ $# -gt 0 ]  # POSIX test (not bash-only [[ ]]) so the script also works when run via `sh`
28
+ do
29
+     key="$1"
30
+
31
+     case $key in
32
+         --text)
33
+             text="$2"  # caption to synthesize; falls back to "A man is whistling" when not given
34
+             shift # past argument
35
+             shift # past value
36
+             ;;
37
+         *) # unknown option: silently skipped
38
+             shift # past argument
39
+             ;;
40
+     esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+     --config="$exp_config" \
46
+     --checkpoint_path="$checkpoint_path" \
47
+     --text="${text:-A man is whistling}" \
48
+     --vocoder_path="$vocoder_path" \
49
+     --vocoder_config_path="$vocoder_config_path" \
50
+     --num_steps="$num_steps" \
51
+     --guidance_scale="$guidance_scale" \
52
+     --output_dir="$output_dir"
egs/tta/audioldm/run_train.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd "$(dirname "$0")" && pwd)  # absolute path of the directory holding this script
8
+ work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")  # three levels up from exp_dir
9
+
10
+ export WORK_DIR="$work_dir"
11
+ export PYTHONPATH="$work_dir"
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+     --config="$exp_config" \
24
+     --num_workers="$num_workers" \
25
+     --exp_name="$exp_name" \
26
+     --stdout_interval=25