Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

README.md +46 -3
config.json +21 -0
filebrowser.db +0 -0
firefly-gan-vq-fsq-4x1024-42hz-generator.pth +3 -0
model.pth +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +82 -0

README.md CHANGED Viewed

@@ -1,3 +1,46 @@
----
-license: afl-3.0
----

+---
+tags:
+- text-to-speech
+license: cc-by-nc-sa-4.0
+language:
+- en
+- zh
+- ja
+pipeline_tag: text-to-speech
+inference: false
+extra_gated_prompt: >-
+  You agree not to use the model to generate content that violates the DMCA or
+  local laws.
+extra_gated_fields:
+  Country: country
+  Specific date: date_picker
+  I agree to use this model for non-commercial use ONLY: checkbox
+---
+# Fish Speech V1.2
+**Fish Speech V1.2** is a leading text-to-speech (TTS) model trained on 300k hours of English, Chinese, and Japanese audio data.
+Please refer to [Fish Speech GitHub](https://github.com/fishaudio/fish-speech) for more info.
+Demo available at [Fish Audio](https://fish.audio/).
+## Citation
+If you find this repository useful, please consider citing this work:
+```
+@misc{fish-speech-v1,
+  author = {Shijia Liao and Tianyu Li},
+  title = {Fish Speech V1},
+  year = {2024},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/fishaudio/fish-speech}}
+}
+```
+## License
+This model is licensed under the CC-BY-NC-SA-4.0 license.
+The source code is released under the BSD-3-Clause license.

config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+    "attention_qkv_bias": false,
+    "codebook_size": 1024,
+    "dim": 1024,
+    "dropout": 0.1,
+    "head_dim": 64,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "max_seq_len": 4096,
+    "model_type": "dual_ar",
+    "n_fast_layer": 4,
+    "n_head": 16,
+    "n_layer": 24,
+    "n_local_heads": 2,
+    "norm_eps": 1e-06,
+    "num_codebooks": 4,
+    "rope_base": 1000000.0,
+    "tie_word_embeddings": false,
+    "use_gradient_checkpointing": true,
+    "vocab_size": 32000
+}

filebrowser.db ADDED Viewed

Binary file (65.5 kB). View file

firefly-gan-vq-fsq-4x1024-42hz-generator.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c64a1054cd6aea91565b409d4f19338c79fe0a8b63fb75d2f0eb0f252f1f7b43
+size 167393289

model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d22f01a8c9092582fa27e8c04767a764fc55a2adb30bab059e1eaa12df01f4e7
+size 980602750

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|begin_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_sequence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|begin_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|end_of_sequence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<|semantic|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<|mel|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<|phoneme_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<|phoneme_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_sequence|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_sequence|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|pad|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}