ml-for-speech
/

styletts-vc

Model card Files Files and versions Community

mrfakename committed on Apr 19, 2024

Commit

9640c47

verified ·

1 Parent(s): 9df0fee

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

.gitattributes +1 -0
Models/VCTK/config.yml +59 -0
Models/VCTK/epoch_2nd_00100.pth +3 -0
Vocoder/LibriTTS/config.json +38 -0
Vocoder/LibriTTS/g_00935000 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Vocoder/LibriTTS/g_00935000 filter=lfs diff=lfs merge=lfs -text

Models/VCTK/config.yml ADDED Viewed

	@@ -0,0 +1,59 @@

+log_dir: "Models/VCTK"
+first_stage_path: "first_stage.pth"
+save_freq: 2
+log_interval: 10
+device: "cuda"
+multigpu: false
+epochs_1st: 150 # number of epochs for first stage training
+epochs_2nd: 100 # number of epochs for second stage training
+batch_size: 32
+pretrained_model: ""
+second_stage_load_pretrained: false # set to true if the pre-trained model is for 2nd stage
+load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
+train_data: "Data/train_list.txt"
+val_data: "Data/val_list.txt"
+F0_path: "Utils/JDC/bst.t7"
+ASR_config: "Utils/ASR/config.yml"
+ASR_path: "Utils/ASR/epoch_00080.pth"
+preprocess_params:
+  sr: 24000
+  spect_params:
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+model_params:
+  hidden_dim: 512
+  n_token: 178
+  style_dim: 128
+  n_layer: 3
+  dim_in: 64
+  max_conv_dim: 512
+  n_mels: 80
+  dropout: 0.2
+  n_domain: 108
+loss_params:
+    lambda_mel: 5. # mel reconstruction loss (1st & 2nd stage)
+    lambda_adv: 1. # adversarial loss (1st & 2nd stage)
+    lambda_reg: 1. # adversarial regularization loss (1st & 2nd stage)
+    lambda_fm: 0.1 # feature matching loss (1st & 2nd stage)
+    lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
+    lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
+    lambda_sty: 1. # style reconstruction loss (1st stage)
+    TMA_epoch: 10 # TMA starting epoch (1st stage)
+    VC_epoch: -5 # VC starting epoch (1st stage), only fine-tune for VC for 5 epochs
+    TMA_CEloss: false # see https://github.com/yl4579/StyleTTS/issues/7
+    lambda_feat: 1. # feature reconstruction loss (1st stage)
+    lambda_pim: 1. # phoneme information maximization loss (2nd stage)
+    lambda_cyc: 1. # cycle consistency loss (2nd stage)
+optimizer_params:
+  lr: 0.0001

Models/VCTK/epoch_2nd_00100.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:841dd6a36943d7bc6c127b9f1857db920eef59da5658108cb5751dac94143457
+size 335469173

Vocoder/LibriTTS/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+    "resblock": "1",
+    "num_gpus": 0,
+    "batch_size": 16,
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+    "freeze_level": 2,
+    "upsample_rates": [10,5,3,2],
+    "upsample_kernel_sizes": [20,10,6,4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "segment_size": 57600,
+    "num_mels": 80,
+    "num_freq": 1025,
+    "n_fft": 2048,
+    "hop_size": 300,
+    "win_size": 1200,
+    "sampling_rate": 24000,
+    "fmin": 0,
+    "fmax": 8000,
+    "fmax_for_loss": null,
+    "num_workers": 4,
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}

Vocoder/LibriTTS/g_00935000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:349789d9c2ed411b564223093d3fe54de09be0ded8d19a16e011e828d547c4a3
+size 57205417