// File size: 3,767 Bytes
// 8c92a11
{
    "base_config": "config/base.json",
    "task_type": "svc",
    "preprocess": {
        // Data augmentations
        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,
        // Online or offline feature extraction ("offline" or "online")
        "features_extraction_mode": "offline",
        // Acoustic features
        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,
        // Content features (all disabled by default; enable per experiment)
        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,
        // Default config for whisper
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Default config for contentvec
        "contentvec_frameshift": 0.02,
        // Default config for mert
        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320,
        // NOTE(review): "mert_frameshit" looks like a typo of "mert_frameshift" —
        // confirm which key name the loader actually reads before renaming.
        // 0.01333 = 320 / 24000, i.e. the hop size at a 24 kHz rate ("24k").
        "mert_frameshit": 0.01333,
        "wenet_frameshift": 0.01, // 10 ms
        // wenetspeech is 4, gigaspeech is 6
        "wenet_downsample_rate": 4,
        // Default STFT / mel config
        "n_mel": 100,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 12000, // todo
        "f0_min": 50, // ~C2
        "f0_max": 1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        // Output subdirectories for extracted features
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Extract content features using a dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_interpolation_for_uv": false,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,
        // Meta files
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            // Prosody features
            "use_f0": true,
            "use_uv": true,
            "use_energy": true,
            // Quantization (0 for no quantization)
            "input_melody_dim": 1,
            "n_bins_melody": 256,
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            // Semantic features
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            // Speaker features
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "use_spkid": true
        }
    }
}