E2-F5-TTS

Sleeping

App Files Files Community

mrfakename committed on Oct 15, 2024

Commit

c154a29

verified ·

1 Parent(s): 80a2615

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (3) hide show

model/dataset.py +18 -2
model/utils.py +7 -0
train.py +7 -4

model/dataset.py CHANGED Viewed

@@ -184,10 +184,15 @@ class DynamicBatchSampler(Sampler[list[int]]):
 def load_dataset(
         dataset_name: str,
-        tokenizer: str,
         dataset_type: str = "CustomDataset",
         audio_type: str = "raw",
         mel_spec_kwargs: dict = dict()
         ) -> CustomDataset:
     print("Loading dataset ...")
@@ -206,7 +211,18 @@ def load_dataset(
             data_dict = json.load(f)
         durations = data_dict["duration"]
         train_dataset = CustomDataset(train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs)
     elif dataset_type == "HFDataset":
         print("Should manually modify the path of huggingface dataset to your need.\n" +
               "May also the corresponding script cuz different dataset may have different format.")

 def load_dataset(
         dataset_name: str,
+        tokenizer: str = "pinyin",
         dataset_type: str = "CustomDataset",
         audio_type: str = "raw",
         mel_spec_kwargs: dict = dict()
+        ) -> CustomDataset | HFDataset:
+    '''
+    dataset_type    - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
+                    - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
+    '''
         ) -> CustomDataset:
     print("Loading dataset ...")
             data_dict = json.load(f)
         durations = data_dict["duration"]
         train_dataset = CustomDataset(train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs)
+    elif dataset_type == "CustomDatasetPath":
+        try:
+            train_dataset = load_from_disk(f"{dataset_name}/raw")
+        except:
+            train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")
+        with open(f"{dataset_name}/duration.json", 'r', encoding='utf-8') as f:
+            data_dict = json.load(f)
+        durations = data_dict["duration"]
+        train_dataset = CustomDataset(train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs)
     elif dataset_type == "HFDataset":
         print("Should manually modify the path of huggingface dataset to your need.\n" +
               "May also the corresponding script cuz different dataset may have different format.")

model/utils.py CHANGED Viewed

@@ -129,6 +129,7 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
     tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                 - "char" for char-wise tokenizer, need .txt vocab_file
                 - "byte" for utf-8 tokenizer
     vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                 - if use "char", derived from unfiltered character & symbol counts of custom dataset
                 - if use "byte", set to 256 (unicode byte range)
@@ -144,6 +145,12 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
     elif tokenizer == "byte":
         vocab_char_map = None
         vocab_size = 256
     return vocab_char_map, vocab_size

     tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
                 - "char" for char-wise tokenizer, need .txt vocab_file
                 - "byte" for utf-8 tokenizer
+                - "custom" if you're directly passing in a path to the vocab.txt you want to use
     vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
                 - if use "char", derived from unfiltered character & symbol counts of custom dataset
                 - if use "byte", set to 256 (unicode byte range)
     elif tokenizer == "byte":
         vocab_char_map = None
         vocab_size = 256
+    elif tokenizer == "custom":
+        with open (dataset_name, "r", encoding="utf-8") as f:
+            vocab_char_map = {}
+            for i, char in enumerate(f):
+                vocab_char_map[char[:-1]] = i
+        vocab_size = len(vocab_char_map)
     return vocab_char_map, vocab_size

train.py CHANGED Viewed

@@ -9,10 +9,10 @@ target_sample_rate = 24000
 n_mel_channels = 100
 hop_length = 256
-tokenizer = "pinyin"
 dataset_name = "Emilia_ZH_EN"
 # -------------------------- Training Settings -------------------------- #
 exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
@@ -44,8 +44,11 @@ elif exp_name == "E2TTS_Base":
 # ----------------------------------------------------------------------- #
 def main():
-    vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
     mel_spec_kwargs = dict(
             target_sample_rate = target_sample_rate,

 n_mel_channels = 100
 hop_length = 256
+tokenizer = "pinyin" # 'pinyin', 'char', or 'custom'
+tokenizer_path = None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
 dataset_name = "Emilia_ZH_EN"
 # -------------------------- Training Settings -------------------------- #
 exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
 # ----------------------------------------------------------------------- #
 def main():
+    if tokenizer == "custom":
+        tokenizer_path = tokenizer_path
+    else:
+        tokenizer_path = dataset_name
+    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
     mel_spec_kwargs = dict(
             target_sample_rate = target_sample_rate,