Continuing training
Dear Fabio,
thanks so much for sharing the checkpoints. I would like to continue the training to see if we can get a better pronunciation. Do you mind sharing the preprocessing/training script so I can use the same parameters and data splits?
As for the forgetting, we could interleave samples from other languages, but I am not completely against having a separate model per language, unless we see economies of scale.
Do you have any graphs showing the learning curves so far?
Best,
Mario
Hello Mario,
The model card already contains all the parameters I used, but let me repeat them here.
This was my second attempt, with a different dataset (previously I used https://huggingface.co/datasets/facebook/multilingual_librispeech/viewer/italian without any preprocessing), in Italian only. A multilingual model would be better, because nowadays Italian uses a lot of English words anyway, especially in the business and tech domains.
I used the F5 gradio finetuning tool with the settings below. I don't have the settings.json anymore, but this is the content; you can translate it to JSON (see the sketch after the list) or use it straight in the gradio finetuning app:
exp_name"F5TTS_Base"
learning_rate=0.00001
batch_size_per_gpu=10000
batch_size_type="frame"
max_samples=64
grad_accumulation_steps=1
max_grad_norm=1
epochs=300
num_warmup_updates=2000
save_per_updates=600
last_per_steps=300
finetune=true
file_checkpoint_train=""
tokenizer_type="char"
tokenizer_file=""
mixed_precision="fp16"
logger="wandb"
bnb_optimizer=false
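For reference, here is a minimal sketch that writes these values out as a settings.json; the flat key/value layout is my assumption, so double-check it against a file exported by the gradio finetuning tool:

import json

# assumed settings.json layout: a flat object mirroring the list above
settings = {
    "exp_name": "F5TTS_Base",
    "learning_rate": 0.00001,
    "batch_size_per_gpu": 10000,
    "batch_size_type": "frame",
    "max_samples": 64,
    "grad_accumulation_steps": 1,
    "max_grad_norm": 1,
    "epochs": 300,
    "num_warmup_updates": 2000,
    "save_per_updates": 600,
    "last_per_steps": 300,
    "finetune": True,
    "file_checkpoint_train": "",
    "tokenizer_type": "char",
    "tokenizer_file": "",
    "mixed_precision": "fp16",
    "logger": "wandb",
    "bnb_optimizer": False,
}

with open("settings.json", "w") as f:
    json.dump(settings, f, indent=2)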
About the dataset: I created this script to extract the audio in the proper format and create what the gradio finetuning tool needs to preprocess the files.
import soundfile as sf
import asyncio
import concurrent.futures
import time
import csv
import shutil
import os
import re
def prepare_metadata_csv(metadata_path):
# Before starting the processing, clear metadata.csv and create a backup
if os.path.exists(metadata_path):
shutil.copy(metadata_path, metadata_path + '.bak') # Create a backup
os.remove(metadata_path) # Remove the file
with open(metadata_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter='|')
writer.writerow(['filename', 'transcription'])
# ylacombe dataset contains a text field that has character sequences to be fixed:
# ' - ' is used to specify a start/end segment of direct dialogue
# '- ' is used to break a word in a new line
def process_transcription(original) -> str:
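    # (?<!\s)-\s matches a hyphen glued to the previous word and followed by
    # whitespace (a line-break hyphenation); ' - ' dialogue dashes are kept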
text = re.sub(r'(?<!\s)-\s', '', original)
    if text != original:
print(f"Original: {original}")
print(f"Processed: {text}")
return text
# Function to process and save a single audio file
def process_sample(audio_path, i, item):
try:
audio = item["audio"]
audio_data = audio["array"]
sampling_rate = audio["sampling_rate"]
transcription = process_transcription(item["text"])
# Save audio file in WAV format
filename = f"{i}.wav"
full_filename_path = os.path.join(audio_path, filename)
sf.write(full_filename_path, audio_data, sampling_rate, format="WAV")
return filename, transcription
    except Exception as e:
        print(f"An unexpected error occurred while processing file {i}: {e}")
        return None, None  # explicit sentinel so the metadata writer can skip this sample
async def fetch_batch(dataset_iterator, batch_size):
it_audio_files = []
for _ in range(batch_size):
try:
it_audio_files.append(next(dataset_iterator))
except StopIteration:
break
return it_audio_files
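# process_batches schedules the fetch of the next batch before handing the
# current one to a thread pool that writes the wav files; since fetch_batch
# has no await points, the scheduled coroutine actually runs when it is awaited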
async def process_batches(project_dir, dataset, batch_size, max_samples = None):
audio_path = f"{project_dir}/wavs"
os.makedirs(audio_path, exist_ok=True) # Create directory if it doesn't exist
# Write to metadata.csv
metadata_path = f"{project_dir}/metadata.csv"
prepare_metadata_csv(metadata_path)
start_time = time.time()
dataset_iterator = iter(dataset)
i = 0
# Get the first batch outside the loop
next_batch_future = asyncio.ensure_future(fetch_batch(dataset_iterator, batch_size))
while True:
current_batch = await next_batch_future # Wait for the next batch to be fetched
if not current_batch: # If the batch is empty (StopIteration), break the loop
break
# Start fetching the next batch while processing the current batch
next_batch_future = asyncio.ensure_future(fetch_batch(dataset_iterator, batch_size))
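        # fan out (index, sample) pairs to worker threads; each call writes one wav file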
with concurrent.futures.ThreadPoolExecutor() as executor:
results = executor.map(lambda args: process_sample(audio_path, *args),
zip(range(i, i + len(current_batch)), current_batch))
with open(metadata_path, 'a', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter='|')
for filename, transcription in results:
if filename and transcription: # Check for None values
writer.writerow([filename, transcription])
i += len(current_batch)
        if max_samples is not None and i >= max_samples:
break
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds to save {i} audio files")
from datasets import load_dataset
import nest_asyncio

nest_asyncio.apply()
# specify the folder where the audio samples and the metadata.csv file will be written
project_dir = "/workspace/F5-TTS/data/italian-ylacombe_char"
dataset = load_dataset("ylacombe/cml-tts", "italian", split="train", streaming=True)
asyncio.run(process_batches(project_dir, dataset, batch_size=4, max_samples=200))
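About Mario's idea of interleaving other languages to limit forgetting: a rough sketch of how the same pipeline could consume a mixed stream. I'm using librispeech_asr as a stand-in English source, and the 70/30 split is an arbitrary starting point:

from datasets import load_dataset, interleave_datasets

it_stream = load_dataset("ylacombe/cml-tts", "italian", split="train", streaming=True)
en_stream = load_dataset("librispeech_asr", "clean", split="train.100", streaming=True)

# keep only the columns process_sample uses so the two schemas line up;
# note librispeech transcripts are uppercase, a char tokenizer may want them lowercased
it_stream = it_stream.select_columns(["audio", "text"])
en_stream = en_stream.select_columns(["audio", "text"])

# roughly 70% Italian / 30% English; tune the ratio against the forgetting you observe
mixed = interleave_datasets([it_stream, en_stream], probabilities=[0.7, 0.3], seed=42)

asyncio.run(process_batches("/workspace/F5-TTS/data/multilingual_char", mixed, batch_size=4, max_samples=200))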
In the gradio finetuning app, create a new project, then make sure to set project_dir to the path in the F5-TTS/data folder where your project has been created.
I don't have any loss curves, sorry; this was my first attempt at finetuning something.
Let me know if you need anything else. I'd be glad to see if you can improve my model, because as you noticed it mispronounces some words.