# XTTSv2-Finetuning-Vi/extend_vocab_config.py
import argparse
import json
import os

import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

def combine_tokenizers(old_tokenizer, new_tokenizer, save_dir):
    # Load both vocab files, take the union, and store it.
    with open(os.path.join(old_tokenizer, 'vocab.json')) as fp:
        json1 = json.load(fp)
    with open(os.path.join(new_tokenizer, 'vocab.json')) as fp:
        json2 = json.load(fp)

    # Build the combined vocabulary: old tokens keep their original order,
    # tokens unique to the new tokenizer are appended after them.
    new_vocab = {}
    idx = 0
    for word in json1.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Add words from the second tokenizer.
    for word in json2.keys():
        if word not in new_vocab:
            new_vocab[word] = idx
            idx += 1

    # Make the directory if necessary.
    os.makedirs(save_dir, exist_ok=True)

    # Save the combined vocab.
    with open(os.path.join(save_dir, 'vocab.json'), 'w') as fp:
        json.dump(new_vocab, fp, ensure_ascii=False)

    # Merge the two merges files by concatenation, skipping the first line
    # (typically a version header) of the second file. Duplicate merge rules
    # are not handled here.
    os.system('cat {} > {}'.format(os.path.join(old_tokenizer, 'merges.txt'), os.path.join(save_dir, 'merges.txt')))
    os.system('tail -n +2 -q {} >> {}'.format(os.path.join(new_tokenizer, 'merges.txt'), os.path.join(save_dir, 'merges.txt')))
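
# Optional helper (a sketch, not called by this script): combine_tokenizers()
# above deliberately leaves duplicate merge rules in merges.txt. If duplicates
# turn out to matter, they can be dropped while preserving first-seen order.
# The function name is illustrative, not part of the original pipeline.
def dedupe_merges(merges_path):
    seen = set()
    kept = []
    with open(merges_path) as fp:
        for line in fp:
            if line not in seen:
                seen.add(line)
                kept.append(line)
    with open(merges_path, 'w') as fp:
        fp.writelines(kept)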

def extend_tokenizer(args):
    root = os.path.join(args.output_path, "XTTS_v2.0_original_model_files/")

    # Save the existing tokenizer's vocab and merges separately.
    existing_tokenizer = Tokenizer.from_file(os.path.join(root, "vocab.json"))
    old_tokenizer_path = os.path.join(root, "old_tokenizer/")
    os.makedirs(old_tokenizer_path, exist_ok=True)
    existing_tokenizer.model.save(old_tokenizer_path)

    # Train a new BPE tokenizer on the target-language text.
    traindf = pd.read_csv(args.metadata_path, sep="|")
    texts = traindf.text.to_list()
    new_tokenizer = Tokenizer(BPE())
    new_tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=[f"[{args.language}]"], vocab_size=args.extended_vocab_size)
    new_tokenizer.train_from_iterator(iter(texts), trainer=trainer)
    new_tokenizer.add_special_tokens([f"[{args.language}]"])
    new_tokenizer_path = os.path.join(root, "new_tokenizer/")
    os.makedirs(new_tokenizer_path, exist_ok=True)
    new_tokenizer.model.save(new_tokenizer_path)

    # Merge the old and new vocab/merges files.
    merged_tokenizer_path = os.path.join(root, "merged_tokenizer/")
    combine_tokenizers(
        old_tokenizer_path,
        new_tokenizer_path,
        merged_tokenizer_path
    )

    # Swap the merged BPE model into the original tokenizer and save it back.
    tokenizer = Tokenizer.from_file(os.path.join(root, "vocab.json"))
    tokenizer.model = BPE.from_file(
        os.path.join(merged_tokenizer_path, 'vocab.json'),
        os.path.join(merged_tokenizer_path, 'merges.txt')
    )
    tokenizer.add_special_tokens([f"[{args.language}]"])
    tokenizer.save(os.path.join(root, "vocab.json"))

    # Clean up the intermediate tokenizer directories.
    os.system(f'rm -rf {old_tokenizer_path} {new_tokenizer_path} {merged_tokenizer_path}')
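
# Optional sanity check (illustrative; not part of the original pipeline, and
# sample_text is an assumed Vietnamese phrase): reload the extended tokenizer
# and confirm the language token exists and a sample sentence encodes.
def sanity_check(args, sample_text="xin chào"):
    root = os.path.join(args.output_path, "XTTS_v2.0_original_model_files/")
    tokenizer = Tokenizer.from_file(os.path.join(root, "vocab.json"))
    assert tokenizer.token_to_id(f"[{args.language}]") is not None
    print(sample_text, "->", tokenizer.encode(sample_text).ids)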

def adjust_config(args):
    config_path = os.path.join(args.output_path, "XTTS_v2.0_original_model_files/config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    # Register the new language, avoiding duplicates on repeated runs.
    if args.language not in config["languages"]:
        config["languages"].append(args.language)

    with open(config_path, 'w') as f:
        json.dump(config, f, indent=4)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True,
                        help="Directory containing XTTS_v2.0_original_model_files/")
    parser.add_argument("--metadata_path", type=str, required=True,
                        help="Pipe-separated metadata CSV with a 'text' column")
    parser.add_argument("--language", type=str, required=True,
                        help="Language code to add, e.g. 'vi'")
    parser.add_argument("--extended_vocab_size", type=int, default=2000,
                        help="Vocab size of the newly trained BPE tokenizer")
    args = parser.parse_args()

    extend_tokenizer(args)
    adjust_config(args)
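
# Example invocation (paths are placeholders, not from the original repo):
#   python extend_vocab_config.py \
#       --output_path checkpoints/ \
#       --metadata_path datasets/metadata_train.csv \
#       --language vi \
#       --extended_vocab_size 2000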