clone

Browse files

Files changed (8) hide show

.gitattributes +1 -0
README.md +70 -0
config.yaml +30 -0
fbank_mfa_gcmvn_stats.npz +3 -0
hifigan.bin +3 -0
hifigan.json +37 -0
pytorch_model.pt +3 -0
vocab.txt +71 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+fbank_mfa_gcmvn_stats.npz filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+---
+library_name: fairseq
+task: text-to-speech
+tags:
+- fairseq
+- audio
+- text-to-speech
+language: en
+datasets:
+- ljspeech
+widget:
+- text: "Hello, this is a test run."
+  example_title: "Hello, this is a test run."
+---
+# fastspeech2-en-ljspeech
+[FastSpeech 2](https://arxiv.org/abs/2006.04558) text-to-speech model from fairseq S^2 ([paper](https://arxiv.org/abs/2109.06912)/[code](https://github.com/facebookresearch/fairseq/tree/main/examples/speech_synthesis)):
+- English
+- Single-speaker female voice
+- Trained on [LJSpeech](https://keithito.com/LJ-Speech-Dataset/)
+## Usage
+```python
+from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+import IPython.display as ipd
+models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
+    "facebook/fastspeech2-en-ljspeech",
+    arg_overrides={"vocoder": "hifigan", "fp16": False}
+)
+model = models[0]
+TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
+generator = task.build_generator(model, cfg)
+text = "Hello, this is a test run."
+sample = TTSHubInterface.get_model_input(task, text)
+wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
+ipd.Audio(wav, rate=rate)
+```
+See also the [fairseq S^2 LJSpeech example](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_synthesis/docs/ljspeech_example.md).
+## Citation
+```bibtex
+@inproceedings{wang-etal-2021-fairseq,
+    title = "fairseq S{\^{}}2: A Scalable and Integrable Speech Synthesis Toolkit",
+    author = "Wang, Changhan  and
+      Hsu, Wei-Ning  and
+      Adi, Yossi  and
+      Polyak, Adam  and
+      Lee, Ann  and
+      Chen, Peng-Jen  and
+      Gu, Jiatao  and
+      Pino, Juan",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-demo.17",
+    doi = "10.18653/v1/2021.emnlp-demo.17",
+    pages = "143--152",
+}
+```

config.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+features:
+  energy_max: 3.2244551181793213
+  energy_min: -4.9544901847839355
+  eps: 1.0e-05
+  f_max: 8000
+  f_min: 0
+  hop_len_t: 0.011609977324263039
+  hop_length: 256
+  n_fft: 1024
+  n_mels: 80
+  n_stft: 513
+  pitch_max: 5.733940816898645
+  pitch_min: -4.660287183665281
+  sample_rate: 22050
+  type: spectrogram+melscale+log
+  win_len_t: 0.046439909297052155
+  win_length: 1024
+  window_fn: hann
+global_cmvn:
+  stats_npz_path: fbank_mfa_gcmvn_stats.npz
+transforms:
+  '*':
+  - global_cmvn
+vocab_filename: vocab.txt
+vocoder:
+  type: hifigan
+  config: hifigan.json
+  checkpoint: hifigan.bin
+hub:
+  phonemizer: g2p

fbank_mfa_gcmvn_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6735b35875c2614cee80bf861c6a604aba35671887e6f04b4449dc257bb15d34
+size 1140

hifigan.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4f4f016c791fd9ca9859a9e25e7eb0a823fee2ea997c1e5ae8e1a9ea5f99b1f
+size 55825897

hifigan.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "resblock": "1",
+    "num_gpus": 0,
+    "batch_size": 16,
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+    "upsample_rates": [8,8,2,2],
+    "upsample_kernel_sizes": [16,16,4,4],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "segment_size": 8192,
+    "num_mels": 80,
+    "num_freq": 1025,
+    "n_fft": 1024,
+    "hop_size": 256,
+    "win_size": 1024,
+    "sampling_rate": 22050,
+    "fmin": 0,
+    "fmax": 8000,
+    "fmax_for_loss": null,
+    "num_workers": 4,
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}

pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a48d454fe66939079d0ddb70f1c062ec669f521a7cfadc608968746e312986ab
+size 494816801

vocab.txt ADDED Viewed

	@@ -0,0 +1,71 @@

+AH0 71007
+N 63410
+T 60842
+S 40263
+D 39886
+R 35965
+L 30358
+sp 27584
+IH0 27113
+DH 26584
+K 25851
+IH1 25683
+Z 25387
+EH1 21690
+AE1 21648
+M 21537
+W 18760
+P 18458
+ER0 18446
+V 18169
+IY0 17832
+AH1 16995
+F 15549
+B 14227
+HH 13468
+IY1 12751
+EY1 12141
+AO1 11595
+AA1 10589
+AY1 9624
+UW1 8865
+SH 7449
+OW1 7441
+NG 6705
+G 5472
+ER1 4898
+Y 4548
+JH 4486
+CH 4355
+TH 3980
+AW1 3607
+UH1 2469
+EH2 1881
+spn 1774
+AO0 1357
+OW0 1328
+EY2 1258
+IH2 1251
+AE2 1104
+UW0 1077
+AY2 1062
+AA2 774
+OY1 771
+AO2 622
+ZH 587
+EH0 568
+OW2 557
+EY0 443
+IY2 435
+UW2 431
+AY0 390
+AE0 374
+AH2 316
+AW2 290
+AA0 259
+ER2 136
+UH2 127
+OY2 44
+UH0 36
+AW0 35
+OY0 4