Continuing training
Dear Fabio,
thanks so much for sharing the checkpoints. I would like to continue the training to see if we can get a better pronunciation. Do you mind sharing the preprocessing/training script so I can use the same parameters and data splits?
As for the forgetting, we could interleave samples from other languages, but I am not completely against having a separate model per language, unless we see economies of scale.
Do you have any graphs showing the learning curves so far?
Best,
Mario
Hello Mario,
The model card already contains all the parameters I used, but let me repeat them here.
This was my second attempt, with a different dataset (previously I used https://huggingface.co/datasets/facebook/multilingual_librispeech/viewer/italian without any preprocessing), in Italian only. A multilingual model would be better, because nowadays Italian uses a lot of English words anyway, especially in the business and tech domains.
I used the F5 gradio finetuning tool with the settings below. I don't have the settings.json anymore, but this is the content; you can translate it to JSON (see the sketch after the list) or use it straight in the gradio finetuning app:
exp_name"F5TTS_Base"
learning_rate=0.00001
batch_size_per_gpu=10000
batch_size_type="frame"
max_samples=64
grad_accumulation_steps=1
max_grad_norm=1
epochs=300
num_warmup_updates=2000
save_per_updates=600
last_per_steps=300
finetune=true
file_checkpoint_train=""
tokenizer_type="char"
tokenizer_file=""
mixed_precision="fp16"
logger="wandb"
bnb_optimizer=false
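For reference, here is a minimal sketch that writes these values out as a settings.json; the flat key/value layout is my assumption, so double-check it against a file exported by the gradio finetuning tool:

import json

# assumed settings.json layout: a flat object mirroring the list above
settings = {
    "exp_name": "F5TTS_Base",
    "learning_rate": 0.00001,
    "batch_size_per_gpu": 10000,
    "batch_size_type": "frame",
    "max_samples": 64,
    "grad_accumulation_steps": 1,
    "max_grad_norm": 1,
    "epochs": 300,
    "num_warmup_updates": 2000,
    "save_per_updates": 600,
    "last_per_steps": 300,
    "finetune": True,
    "file_checkpoint_train": "",
    "tokenizer_type": "char",
    "tokenizer_file": "",
    "mixed_precision": "fp16",
    "logger": "wandb",
    "bnb_optimizer": False,
}

with open("settings.json", "w") as f:
    json.dump(settings, f, indent=2)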
About the dataset: I created this script to extract the audio in the proper format and create what the gradio finetuning tool needs to preprocess the files.
import soundfile as sf
import asyncio
import concurrent.futures
import time
import csv
import shutil
import os
import re
def prepare_metadata_csv(metadata_path):
# Before starting the processing, clear metadata.csv and create a backup
if os.path.exists(metadata_path):
shutil.copy(metadata_path, metadata_path + '.bak') # Create a backup
os.remove(metadata_path) # Remove the file
with open(metadata_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter='|')
writer.writerow(['filename', 'transcription'])
# ylacombe dataset contains a text field that has character sequences to be fixed:
# ' - ' is used to specify a start/end segment of direct dialogue
# '- ' is used to break a word in a new line
def process_transcription(original) -> str:
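    # (?<!\s)-\s matches a hyphen glued to the previous word and followed by
    # whitespace (a line-break hyphenation); ' - ' dialogue dashes are kept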
text = re.sub(r'(?<!\s)-\s', '', original)
    if text != original:
print(f"Original: {original}")
print(f"Processed: {text}")
return text
# Function to process and save a single audio file
def process_sample(audio_path, i, item):
try:
audio = item["audio"]
audio_data = audio["array"]
sampling_rate = audio["sampling_rate"]
transcription = process_transcription(item["text"])
# Save audio file in WAV format
filename = f"{i}.wav"
full_filename_path = os.path.join(audio_path, filename)
sf.write(full_filename_path, audio_data, sampling_rate, format="WAV")
return filename, transcription
    except Exception as e:
        print(f"An unexpected error occurred while processing file {i}: {e}")
        return None, None  # explicit sentinel so the metadata writer can skip this sample
async def fetch_batch(dataset_iterator, batch_size):
it_audio_files = []
for _ in range(batch_size):
try:
it_audio_files.append(next(dataset_iterator))
except StopIteration:
break
return it_audio_files
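# process_batches schedules the fetch of the next batch before handing the
# current one to a thread pool that writes the wav files; since fetch_batch
# has no await points, the scheduled coroutine actually runs when it is awaited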
async def process_batches(project_dir, dataset, batch_size, max_samples = None):
audio_path = f"{project_dir}/wavs"
os.makedirs(audio_path, exist_ok=True) # Create directory if it doesn't exist
# Write to metadata.csv
metadata_path = f"{project_dir}/metadata.csv"
prepare_metadata_csv(metadata_path)
start_time = time.time()
dataset_iterator = iter(dataset)
i = 0
# Get the first batch outside the loop
next_batch_future = asyncio.ensure_future(fetch_batch(dataset_iterator, batch_size))
while True:
current_batch = await next_batch_future # Wait for the next batch to be fetched
if not current_batch: # If the batch is empty (StopIteration), break the loop
break
# Start fetching the next batch while processing the current batch
next_batch_future = asyncio.ensure_future(fetch_batch(dataset_iterator, batch_size))
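        # fan out (index, sample) pairs to worker threads; each call writes one wav file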
with concurrent.futures.ThreadPoolExecutor() as executor:
results = executor.map(lambda args: process_sample(audio_path, *args),
zip(range(i, i + len(current_batch)), current_batch))
with open(metadata_path, 'a', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter='|')
for filename, transcription in results:
if filename and transcription: # Check for None values
writer.writerow([filename, transcription])
i += len(current_batch)
        if max_samples is not None and i >= max_samples:
break
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds to save {i} audio files")
from datasets import load_dataset
import nest_asyncio

nest_asyncio.apply()
# specify the folder where the audio samples and the metadata.csv file will be written
project_dir = "/workspace/F5-TTS/data/italian-ylacombe_char"
dataset = load_dataset("ylacombe/cml-tts", "italian", split="train", streaming=True)
asyncio.run(process_batches(project_dir, dataset, batch_size=4, max_samples=200))
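About Mario's idea of interleaving other languages to limit forgetting: a rough sketch of how the same pipeline could consume a mixed stream. I'm using librispeech_asr as a stand-in English source, and the 70/30 split is an arbitrary starting point:

from datasets import load_dataset, interleave_datasets

it_stream = load_dataset("ylacombe/cml-tts", "italian", split="train", streaming=True)
en_stream = load_dataset("librispeech_asr", "clean", split="train.100", streaming=True)

# keep only the columns process_sample uses so the two schemas line up;
# note librispeech transcripts are uppercase, a char tokenizer may want them lowercased
it_stream = it_stream.select_columns(["audio", "text"])
en_stream = en_stream.select_columns(["audio", "text"])

# roughly 70% Italian / 30% English; tune the ratio against the forgetting you observe
mixed = interleave_datasets([it_stream, en_stream], probabilities=[0.7, 0.3], seed=42)

asyncio.run(process_batches("/workspace/F5-TTS/data/multilingual_char", mixed, batch_size=4, max_samples=200))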
In the gradio finetuning app, create a new project, then make sure to set project_dir to the path in the F5-TTS/data folder where your project has been created.
I don't have any loss curves, sorry; this was my first attempt at finetuning something.
Let me know if you need anything else. I'd be glad to see if you can improve my model, because as you noticed it mispronounces some words.