In [None]:
import h5py
import glob
import torch
import os
import torchaudio
import shutil
import numpy as np
import soundfile as sf
from utils.g2p import PhonemeBpeTokenizer
from utils.prompt_making import make_transcript
from data.collation import get_text_token_collater
from tqdm.notebook import tqdm

```
MyTTSDataset/train
├── bpe_69.json
├── wav1
│   ├── 1-1.wav
│   ├── 1-2.wav
│   ├── audio_ann_sum.txt
│   └── audio_sum.hdf5
├── wav2
│   ├── 2-1.wav
│   ├── 2-2.wav
│   ├── audio_ann_sum.txt
│   └── audio_sum.hdf5
......
└── wav{n}
```

In [2]:
# Trim WAV files to a maximum length
def load_trim_and_save_audio(input_file_path, output_file_path, max_length=15.0):
    """Truncate an audio file to at most ``max_length`` seconds.

    The file is loaded with ``torchaudio.load``. Only when it holds more
    samples than ``max_length`` seconds allows is the truncated waveform
    written to ``output_file_path``; otherwise nothing is written at all.

    Args:
        input_file_path: Path of the audio file to read.
        output_file_path: Path that receives the truncated audio.
        max_length: Maximum duration to keep, in seconds.

    Returns:
        None.
    """
    audio, sr = torchaudio.load(input_file_path)

    # Number of samples that fit into `max_length` seconds at this sample rate.
    limit = int(max_length * sr)

    # Short enough already: leave the output path untouched.
    if audio.size(-1) <= limit:
        return

    # Keep every channel, cut along the sample axis, and save the result.
    torchaudio.save(output_file_path, audio[:, :limit], sr)

# `glob` is imported as a module, so the function has to be called as
# `glob.glob`. The recursive pattern finds WAV files directly under the
# training root as well as inside its per-folder sub-directories.
wav_files = glob.glob('MyTTSDataset/train/**/*.wav', recursive=True)
for wav_file in tqdm(wav_files):
    # Overwrite each file in place with its trimmed version (if it was too long).
    load_trim_and_save_audio(wav_file, wav_file)


In [82]:
# Mappings from symbol to numeric ID and vice versa:
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)

# Location of the BPE vocabulary and the phoneme tokenizer built from it.
tokenizer_path = "./utils/g2p/bpe_69.json"
tokenizer = PhonemeBpeTokenizer(tokenizer_path)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Heavy helper objects, built once per device and shared by every make_prompts call.
_PROMPT_RESOURCES = {}


def _get_prompt_resources():
    """Return the cached (text_tokenizer, text_collater, codec) triple for ``device``."""
    if device not in _PROMPT_RESOURCES:
        _PROMPT_RESOURCES[device] = (
            PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json"),
            get_text_token_collater(),
            AudioTokenizer(device),
        )
    return _PROMPT_RESOURCES[device]


def make_prompts(name, audio_prompt_path, transcript=None):
    """Turn one audio prompt into audio tokens, text tokens and its transcript.

    Args:
        name: Identifier of the prompt, forwarded to ``make_transcript``.
        audio_prompt_path: Path of the audio file to load.
        transcript: Optional transcript, forwarded to ``make_transcript``.

    Returns:
        A tuple ``(audio_tokens, text_tokens, langs, text_pr)``:
        ``audio_tokens`` is a NumPy array derived from the codec output,
        ``text_tokens`` comes from the text collater, ``langs`` from the
        phoneme tokenizer, and ``text_pr`` is the text returned by
        ``make_transcript``.

    Raises:
        ValueError: If the audio is longer than 15 seconds.
    """
    wav_pr, sr = torchaudio.load(audio_prompt_path)

    # check length
    duration = wav_pr.size(-1) / sr
    if duration > 15:
        # Report the duration itself, not the waveform tensor divided by the rate.
        raise ValueError(f"Prompt too long, expect length below 15 seconds, got {duration:.2f} seconds.")

    # Downmix any multi-channel recording to mono by averaging the channels.
    if wav_pr.size(0) > 1:
        wav_pr = wav_pr.mean(0, keepdim=True)

    # Reuse the tokenizer, collater and codec instead of rebuilding them per file.
    text_tokenizer, text_collater, codec = _get_prompt_resources()

    text_pr, _ = make_transcript(name, wav_pr, sr, transcript)

    # tokenize audio
    encoded_frames = tokenize_audio(codec, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, _ = text_collater([phonemes])

    return audio_tokens, text_tokens, langs, text_pr
    
def create_dataset(data_dir, dataloader_process_only):
    """Tokenize every WAV file in ``data_dir`` into an HDF5 file plus annotations.

    For each ``*.wav`` file directly inside ``data_dir`` the audio tokens
    returned by ``make_prompts`` are stored in ``{data_dir}/audio_sum.hdf5``
    under ``<stem>/audio``, and one ``stem|duration|lang|text`` line is written
    to ``{data_dir}/audio_ann_sum.txt``.

    Both output files are rewritten from scratch on every call so that they
    always describe the same set of utterances. A file that fails to process
    is reported and skipped; it does not abort the run.

    Args:
        data_dir: Directory holding the WAV files; the outputs are written here too.
        dataloader_process_only: When falsy, the function does nothing.

    Returns:
        None.
    """
    if not dataloader_process_only:
        return

    h5_output_path = f"{data_dir}/audio_sum.hdf5"
    ann_output_path = f"{data_dir}/audio_ann_sum.txt"
    # Sorted so the annotation order is reproducible across runs and platforms.
    audio_paths = sorted(glob.glob(f"{data_dir}/*.wav"))  # Change this to match your audio file extension

    # Open both outputs once, in truncate mode, so re-running cannot leave
    # duplicated annotation lines next to a freshly rewritten HDF5 file.
    with h5py.File(h5_output_path, 'w') as h5_file, \
            open(ann_output_path, 'w', encoding='utf-8') as ann_file:
        for audio_path in audio_paths:
            try:
                stem = os.path.splitext(os.path.basename(audio_path))[0]
                # make_prompts accepts only name / audio_prompt_path / transcript.
                audio_tokens, _, langs, text = make_prompts(name=stem, audio_prompt_path=audio_path)

                # Compute everything that can fail before touching the outputs,
                # so a failure does not leave an HDF5 group without annotation.
                audio, sample_rate = sf.read(audio_path)
                duration = len(audio) / sample_rate
                ann_line = f'{stem}|{duration}|{langs[0]}|{text}\n'  # trailing newline ends the record

                # One group per stem, holding the audio tokens as a dataset.
                grp = h5_file.create_group(stem)
                grp.create_dataset('audio', data=audio_tokens)
                ann_file.write(ann_line)
            except Exception as e:
                # Best effort: report which file failed and continue with the rest.
                print(f"An error occurred while processing {audio_path}: {e}")

In [3]:
basedir = "MyTTSDataset/train"
# Only sub-directories hold audio. Loose files in the root (for example
# bpe_69.json) must be skipped, otherwise create_dataset would try to create
# its output files underneath a regular file and crash.
folds = sorted(
    entry for entry in os.listdir(basedir)
    if os.path.isdir(os.path.join(basedir, entry))
)
for fold in folds:
    print('>>>>',fold)
    data_dir = os.path.join(basedir,fold)
    create_dataset(data_dir, dataloader_process_only=True)

>>>> 08-13
Detected language: zh
跟大陸文革的結束都開始了一個新的從新去反省以及去醒示近代文學的一個階段


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BBS\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\BBS\AppData\Local\Temp\jieba.cache
Loading model cost 0.364 seconds.
DEBUG:jieba:Loading model cost 0.364 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Detected language: zh
因為張二林後來也很自然在他非常小的時候一方面讀古典文學
Detected language: zh
一方面她的英文極好她等於是西方的英語教育裡面出來的人
Detected language: zh
所以我想 我們就看他住在上海 這種租界的區域當中他受到的影響
Detected language: zh
是非常複雜的華洋雜處的一種文學影響
Detected language: zh
展露了这个才能之后她就投稿到报纸上去参加征文比赛
Detected language: zh
得到名次 得到獎金我們看到他曾經講一個很有趣的故事
Detected language: zh
他的母親說你寫作得獎了那你要不要去買一些世界文學明珠
Detected language: zh
表示你將來是要走作家這條路他說他沒有照他母親的話去做他偷偷跑出去
Detected language: zh
就去買了一支當時名牌的口紅我想這個世界裡面非常明顯透露出
Detected language: zh
張二林覺得女性的曖昧好像是一種本能他也覺得他喜歡寫作是一回事
Detected language: zh
所以我们都知道,在台湾曾经非常非常流行发生了非常大影响力的张爱玲的小说
Detected language: zh
可是還有一些東西可能比寫作更重要是要它要活出它生活裡的意思
Detected language: zh
一個獨特的風格所以我們在讀張愛玲的文學的時候我們會感覺到她真正關心的人
Detected language: zh
其实并不是文化里的人而是一些城市边缘
Detected language: zh
小資產階級對於自己的生活有著小小的愛恨的一些女性
Detected language: zh
我自己特別喜歡他在《請場之戀》裡面寫到的很多的女性
Detected language: zh
常常寫到他們怎麼去談戀愛,怎麼相親。然後怎麼樣去
Detected language: zh
緊張地面對自己接下來生命裡要面對的一個男人因為
Detected language: zh
因為這個相親不是一個普通的社交活動因為這個相親裡包含了他這一輩子
Detected language: zh
將來是幸福還是不幸福的一個過程所以在這種半開放的社會
Dete

Traceback (most recent call last):
  File "c:\Users\BBS\miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\BBS\AppData\Local\Temp\ipykernel_3852\1415381140.py", line 7, in <module>
    create_dataset(data_dir, dataloader_process_only=True)
  File "c:\Users\BBS\code\VALL-E-X-Trainer-by-CustomData\customs\make_custom_dataset.py", line 87, in create_dataset
  File "c:\Users\BBS\code\VALL-E-X-Trainer-by-CustomData\customs\make_custom_dataset.py", line 44, in make_prompts
    text_tokens, enroll_x_lens = text_collater(
  File "c:\Users\BBS\code\VALL-E-X-Trainer-by-CustomData\data\tokenizer.py", line 217, in __init__
    model = EncodecModel.encodec_model_24khz()
  File "C:\Users\BBS\AppData\Roaming\Python\Python39\site-packages\encodec\model.py", line 280, in encodec_model_24khz
    model.load_state_dict(state_dict)
  File "c:\Users\BBS\miniconda3\lib\site-packages\torch\nn\modules\mod

更改格式為
```bash
data_dir_OK
├── bpe_69.json
├── utt1-1.wav
├── utt1-2.wav
├── utt2-1.wav
......
├── utt{n}.wav
├── audio_ann_sum.txt
└── audio_sum.hdf5
```

In [None]:
# Source dataset root, and the sibling directory that receives the merged copy.
data_dir = 'MyTTSDataset/train'
output_dir = f'{data_dir}_tune'

def merge_hdf5(files_to_merge, output_path):
    """Combine the top-level groups of several HDF5 files into one new file.

    ``output_path`` is created (or truncated). Every top-level group of each
    input file is copied into it. When a group name is already present in the
    output and both sides contain an ``audio`` dataset, the two datasets are
    concatenated along axis 0 and stored as the new ``audio`` dataset. When a
    group name repeats but either side lacks ``audio``, the later group is
    skipped.

    Args:
        files_to_merge: Iterable of paths of the HDF5 files to read.
        output_path: Path of the HDF5 file to write.

    Returns:
        None.
    """
    with h5py.File(output_path, 'w') as merged:
        for src_path in tqdm(files_to_merge):
            with h5py.File(src_path, 'r') as src:
                for key in src:
                    # First time this group name is seen: copy it as is.
                    if key not in merged:
                        merged.copy(src[key], key)
                        continue

                    # Repeated name: only mergeable if both sides carry 'audio'.
                    if not ('audio' in src[key] and 'audio' in merged[key]):
                        continue

                    existing = merged[key]['audio'][:]
                    incoming = src[key]['audio'][:]
                    combined = np.concatenate((existing, incoming), axis=0)

                    # Drop the old dataset before creating the combined one
                    # under the same name.
                    del merged[key]['audio']
                    merged[key].create_dataset('audio', data=combined)

# Create the flat output directory (no error if it already exists).
os.makedirs(output_dir, exist_ok=True)

hdf5_files = []
audio_ann_content = ""

# Copy bpe_69.json
bpe_69_path = os.path.join(data_dir, 'bpe_69.json')
if os.path.exists(bpe_69_path):
    shutil.copy(bpe_69_path, os.path.join(output_dir, 'bpe_69.json'))

# Walk every sub-folder: remember the HDF5 files, copy the WAV files into the
# flat output directory, and collect the annotation text.
for root, dirs, files in os.walk(data_dir):
    for file in tqdm(files):
        if file == 'audio_sum.hdf5':
            hdf5_files.append(os.path.join(root, file))
        elif file.endswith('.wav'):
            # NOTE: WAV files that share a name across sub-folders overwrite
            # each other here because the destination is a single flat folder.
            shutil.copy(os.path.join(root, file), os.path.join(output_dir, file))
        elif file == 'audio_ann_sum.txt':
            # The annotation files are written as UTF-8, so read them as UTF-8
            # instead of relying on the platform's default encoding.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                content = f.read()
            # Terminate the last record if needed, without adding blank lines.
            if content and not content.endswith('\n'):
                content += '\n'
            audio_ann_content += content

with open(os.path.join(output_dir, 'audio_ann_sum.txt'), 'w', encoding='utf-8') as f:
    f.write(audio_ann_content)

merge_hdf5(hdf5_files, os.path.join(output_dir, 'audio_sum.hdf5'))

In [6]:
import utils.g2p.cleaners
from tokenizers import Tokenizer

def _clean_text(text, cleaner_names):
  """Run ``text`` through the named cleaners from ``utils.g2p.cleaners`` in order.

  Each cleaner is called with the current text and must return a
  ``(text, langs)`` pair; the text of one cleaner feeds the next.

  Args:
    text: The text to clean.
    cleaner_names: Iterable of attribute names looked up on ``utils.g2p.cleaners``.

  Returns:
    A tuple ``(text, langs)`` from the last cleaner. When ``cleaner_names`` is
    empty the input text is returned unchanged together with ``None``.

  Raises:
    Exception: If a name does not refer to a cleaner in ``utils.g2p.cleaners``.
  """
  # Defined up front so an empty `cleaner_names` does not leave it unbound.
  langs = None
  for name in cleaner_names:
    # Default to None so the check below is reached for unknown names
    # instead of getattr raising AttributeError first.
    cleaner = getattr(utils.g2p.cleaners, name, None)
    if not cleaner:
      raise Exception('Unknown cleaner: %s' % name)
    text, langs = cleaner(text)
  return text, langs

# BPE tokenizer loaded from the vocabulary file; the filtering cell below uses
# it to encode phoneme strings.
# NOTE(review): this rebinds the global `tokenizer` that an earlier cell set to
# a PhonemeBpeTokenizer instance - confirm nothing still relies on that object.
tokenizer = Tokenizer.from_file("utils/g2p/bpe_69.json")

In [14]:
file_path = 'MyTTSDataset/valid_tune/audio_ann_sum.txt'

# Step 1: Read all non-blank lines from the file
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

clean_lines = []

# Step 2: Keep only the lines whose text still yields at least one BPE token
for line in tqdm(lines):
    # Annotation lines are `stem|duration|lang|text`. Limiting the split keeps
    # any '|' inside the text intact, and four fields are needed for part[3].
    part = line.split('|', 3)
    if len(part) < 4:
        print('>>>> drop',line)
        continue

    text = part[3]
    phonemes, langs = _clean_text(text, ['cje_cleaners'])
    phonemes = phonemes.replace(" ", "_")
    # 3. tokenize phonemes
    phoneme_tokens = tokenizer.encode(phonemes).ids
    if not len(phoneme_tokens):
        print('>>>> drop',text)
        continue

    clean_lines.append(line+'\n')

# Step 3: Write the remaining lines back to the file they were read from, so
# the message below is accurate.
with open(file_path, 'w', encoding='utf-8') as f:
    f.writelines(clean_lines)

print(f"The line has been removed from {file_path}")


  0%|          | 0/3811 [00:00<?, ?it/s]

The line has been removed from MyTTSDataset/valid_tune/audio_ann_sum.txt
