|
""" Whisper training script using Hugging Face Transformers. """ |
|
|
|
import os |
|
from dataclasses import dataclass |
|
from math import ceil |
|
|
|
import evaluate |
|
import torch |
|
import wandb |
|
from datasets import IterableDatasetDict, load_dataset |
|
from transformers import ( |
|
AutoConfig, |
|
AutoModelForSpeechSeq2Seq, |
|
AutoProcessor, |
|
Seq2SeqTrainer, |
|
Seq2SeqTrainingArguments, |
|
TrainerCallback, |
|
WhisperProcessor |
|
) |
|
from transformers import set_seed |
|
from transformers.models.whisper.english_normalizer import BasicTextNormalizer |
|
from transformers.trainer_pt_utils import IterableDataset, IterableDatasetShard |
|
|
|
"""Then, we will load the processor, model configuration, architecture and weights, and the dataset (in streaming mode). The Swedish split of Fleurs is not a massive dataset, so we could easily download it and store it in memory, but it is good to learn how to use the streaming mode in case you want to fine-tune your model on larger datasets. """
|
|
|
# Identifiers of the pretrained checkpoint and of the dataset / language
# configuration used for fine-tuning.
model_id = "openai/whisper-tiny"
dataset_id = "google/fleurs"
dataset_language_code = "sv_se"

# Processor, configuration and weights are all loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id)
config = AutoConfig.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

# streaming=True loads the splits in streaming mode instead of downloading
# them up front.
dataset = load_dataset(dataset_id, dataset_language_code, streaming=True)
|
|
|
"""The first time you run this code, make sure everything works fine using a small sample and low number of training steps. Just uncomment the next cell and run it. One note: since the dataset is loaded in streaming mode, the instruction will not be executed immediately. Instead, the dataset will be subsampled only when data will be needed during training.""" |
|
|
|
# Smoke-test switch: when True, each split is reduced to a handful of
# shuffled samples so the whole script can be checked quickly.
test_script = True

if test_script is True:
    # Shuffle with a fixed seed, then keep only the first few samples.
    for _split, _size in (("train", 8), ("validation", 4), ("test", 4)):
        dataset[_split] = dataset[_split].shuffle(seed=42).take(_size)
|
|
|
"""The raw dataset is not yet ready for training. As described in my first article about Whisper, the input audio waveform needs to be transformed into a Log-mel Spectrogram. I recommend reading the [Audio Preprocessing section](https://marinone94.github.io/Whisper-paper/#audio-preprocessing) to understand the process. For the scope of this article, you should just know that the audio is translated from the time domain to its frequency representation using a sliding window, and adjusted to simulate human hearing. The Whisper Feature Extractor included in the Whisper Processor will take care of the rest.
|
|
|
Furthermore, the reference transcripts need to be tokenized, since the model outputs one token at a time and they are used to compute the loss during training. Again, the Tokenizer will take care of that, but the task needs to be included in the preprocessing step.
|
|
|
When we introduced the WER metric, we learned about the importance of normalizing the texts. But should we do that also before training? That is up to you, but you should remember that Whisper models have been pretrained to predict capitalization, digits, and punctuation. So if you normalize the reference transcripts before fine-tuning, you will teach the model not to predict capital letters, digits, and punctuation. This does not mean that the model will never predict them, since it has been extensively pretrained to do so. To wrap up, your choice should depend on the final application and the dataset size, but in general I recommend not normalizing the references before training.
|
|
|
Finally, by storing the input features in the default model input name, the trainer will automatically pick the correct ones during training. Thus, don't hard-code it! |
|
""" |
|
|
|
# Key under which the feature extractor stores its output; read from the
# processor rather than hard-coded.
model_input_name = processor.feature_extractor.model_input_names[0]

# Text normalizer used optionally during preprocessing and always when
# computing metrics.
normalizer = BasicTextNormalizer()
|
|
|
def prepare_dataset(batch, normalize=False):
    """Turn one raw sample into model inputs and label token ids.

    Args:
        batch: A single dataset sample. It must provide ``"audio"`` (a
            mapping with ``"array"`` and ``"sampling_rate"``) and
            ``"raw_transcription"`` (the reference text).
        normalize: When exactly ``True``, the transcript is lower-cased and
            passed through the module-level ``normalizer`` before
            tokenization; otherwise it is only stripped of surrounding
            whitespace.

    Returns:
        The same ``batch`` object, with the extracted features stored under
        ``model_input_name``, the number of audio samples under
        ``"input_length"`` and the tokenized transcript under ``"labels"``.
    """
    audio = batch["audio"]
    waveform = audio["array"]

    # The feature extractor returns a batch of one; keep its only element.
    features = processor.feature_extractor(waveform, sampling_rate=audio["sampling_rate"])
    batch[model_input_name] = features.get(model_input_name)[0]
    batch["input_length"] = len(waveform)

    transcript = batch["raw_transcription"]
    if normalize is True:
        transcript = normalizer(transcript.lower())

    batch["labels"] = processor.tokenizer(transcript.strip()).input_ids
    return batch
|
|
|
"""We will use the `.map` method to apply our preprocessing function to the whole dataset. At the same time, we will drop all the columns which are not strictly needed during training. Since `input_features`, `input_length` and `labels` are not features of the raw dataset, we can remove all the original ones. Finally, we will convert the dataset features to `torch` type since the dataset has no `__len__` property (again, we are in streaming mode). """
|
|
|
|
|
|
|
# Column names of the raw dataset; every one of them is dropped once the
# preprocessing function has added the columns needed for training.
raw_datasets_features = list(dataset["train"].features.keys())

# Apply the same preprocessing (without transcript normalization) to each
# split and expose the result in torch format.
preprocessed_dataset = IterableDatasetDict()
for _split in ("train", "validation", "test"):
    preprocessed_dataset[_split] = (
        dataset[_split]
        .map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
            fn_kwargs={"normalize": False},
        )
        .with_format("torch")
    )
|
|
|
"""Since we want to evaluate our model on the validation set during training, we also need to provide a method that computes the metrics given the model predictions. It looks very similar to the function we introduced above, but since it will receive a single prediction object, we need to extract the predicted tokens and the corresponding labels. Furthermore, we replace the label ids equal to -100 with the padding token. A couple of minutes of patience and you will understand why. |
|
|
|
When decoding the prediction and the labels, we need to discard the special tokens. Those are used to force the model to perform specific tasks. You can read more [here](https://marinone94.github.io/Whisper-paper/#tasks). |
|
""" |
|
|
|
# "wer" metric loaded through the `evaluate` library; compute_metrics calls
# its `compute` method with decoded predictions and references.
metric = evaluate.load("wer")
|
|
|
def compute_metrics(pred):
    """Compute the word error rate, scaled by 100, for one prediction object.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids in which -100 marks the
            positions to ignore). ``label_ids`` is modified in place.

    Returns:
        A dict with a single ``"wer"`` entry.
    """
    reference_ids = pred.label_ids

    # -100 is not a valid token id, so restore the padding token (in place)
    # before decoding.
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    # Special tokens are dropped while decoding.
    hypotheses = processor.batch_decode(pred.predictions, skip_special_tokens=True)
    references = processor.batch_decode(reference_ids, skip_special_tokens=True)

    # Score normalized text on both sides.
    hypotheses = list(map(normalizer, hypotheses))
    references = list(map(normalizer, references))

    # Discard every pair whose normalized reference is empty.
    hypotheses = [hyp for idx, hyp in enumerate(hypotheses) if references[idx]]
    references = [ref for ref in references if ref]

    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}
|
|
|
"""Alright, we are almost done preparing our dataset. Quite a lot of work, I know, but that is most of the job. |
|
|
|
The last step is to define a data collator, which will build data batches from the datasets during training using the Whisper Processor. It will also pad input features and labels.
|
|
|
Also, in the metrics computation method we replaced the labels with id equal to -100. It was done because the data collator **must** set the padding tokens to -100 so that the trainer will ignore them when computing the loss. That was the reverse step. |
|
""" |
|
|
|
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed samples into one padded batch of tensors.

    Attributes:
        processor: Processor whose feature extractor pads the audio inputs
            and whose tokenizer pads the label ids.
        decoder_start_token_id: Token id removed from the front of the labels
            when every sequence in the batch starts with it.
    """

    processor: WhisperProcessor
    decoder_start_token_id: int

    def __call__(self, features):
        """Pad the inputs and labels of ``features`` and return the batch.

        Args:
            features: Samples to collate. Each one holds the model input
                under the processor's first input name and the label token
                ids under ``"labels"``.

        Returns:
            The padded feature-extractor batch, extended with a ``"labels"``
            entry in which padded positions are set to -100.
        """
        input_key = self.processor.model_input_names[0]

        # Inputs and labels are padded by different components, so split
        # them into two lists of single-key dicts first.
        audio_items = [{input_key: sample[input_key]} for sample in features]
        label_items = [{"input_ids": sample["labels"]} for sample in features]

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Every position whose attention mask is not 1 is padding; mark it
        # with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when all sequences begin with the
        # decoder start token.
        starts_with_decoder_token = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_decoder_token:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
|
|
|
# Collator instance handed to the trainer; the decoder start token id is
# taken from the loaded model's configuration.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
|
|
|
"""The next step is something I would have definitely missed if I had not attended the 🤗 Whisper Fine-Tuning Event. Thanks, guys, I learned a ton!
|
|
|
Still, there is something mysterious to me, so I would love it if someone explained it to me. Streaming datasets are not automatically shuffled after each epoch, therefore we define a Callback to do so. However, if we set the number of epochs in the Training Arguments (which we will see shortly), the Trainer complains that the dataset has no length, and it asks us to define the maximum number of training steps. So, will this Callback ever be used? Or will the Trainer not be aware of having completed an epoch? Thanks in advance to whoever will clarify this to me!
|
""" |
|
|
|
|
|
|
|
class ShuffleCallback(TrainerCallback):
    """Trainer callback that advances the epoch of a streaming train set."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        """Bump the dataset epoch by one at the start of each epoch.

        Only ``train_dataloader`` is used. Nothing happens when its dataset
        is an ``IterableDatasetShard`` or is not an ``IterableDataset``.
        """
        streamed = train_dataloader.dataset

        # Shards are deliberately left untouched here (presumably the
        # Trainer advances their epoch itself — confirm).
        if isinstance(streamed, IterableDatasetShard):
            return

        if isinstance(streamed, IterableDataset):
            streamed.set_epoch(streamed._epoch + 1)
|
|
|
"""We are finally done preparing our data! But do you remember that Whisper is a multi-task Speech Recognition model? And that the task is simply induced using special prefix tokens? Good, now it is time to instruct the model. To do so, we can set those special tokens using the Tokenizer embedded in the Processor. |
|
|
|
In our specific case, we could skip this step since English transcription is the default behaviour. Still, this is how you would do if you were in a multilingual setting. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
# Configure the tokenizer's prefix tokens for the "sv" language and the
# "transcribe" task; this tokenizer is the one that builds the labels.
processor.tokenizer.set_prefix_tokens(language="sv", task="transcribe")
|
|
|
|
|
|
|
|
|
"""(Here you can see what happens if we define only the number of epochs. Scroll down a bit to see explanation and working implementation of Training Arguments and Trainer).""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Cool, we are almost ready for training! Let's define (and create, if missing) the output directory and define some Training Arguments. You can read about all the parameters on the [🤗 docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments).
|
|
|
Here, we will instruct the trainer to both train and evaluate the model, define how often metrics should be logged, evaluation should be performed on the evaluation set, model saved, and what batch size to use. The model - in this configuration - **will not be** pushed to the 🤗 hub since it is quite slow. Make sure to authenticate, create a repo and push your model if you train a large model, or use a large dataset! |
|
|
|
We will also use mixed precision (16-bit floating point, or fp16) if we are running our training on a GPU. |
|
|
|
We will also instruct the model to use the `generate` method for evaluation. That method is used for inference, and it applies a decoding technique to the predicted logits. In this case, it will use greedy search, since we set the number of beams to 1. I briefly introduced decoding algorithms in the [Decoder paragraph](https://marinone94.github.io/Whisper-paper/#decoder) of my first article, but for now you can simply think of it as selecting the next token as the highest probability, after applying a softmax to the logits. I am considering writing a post about the impact of decoding algorithms on Whisper performance, so let me know if you are interested!
|
|
|
Last, we can track our training using several experiment tracking tools. I use Weights and Biases - great tool, you should definitely have a look - but 🤗 also supports "azure_ml", "comet_ml", "mlflow", "neptune" and "tensorboard". You can use "all" (default) to report to all integrations installed, "none" for no integrations. Since WandB is installed in this environment, you should explicitly set it to "none" if you don't have an account.
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Experiment tracker name handed to the training arguments further down.
report_to = "wandb"

# Authenticate with Weights & Biases, then open the run for this project.
wandb.login()
wandb.init(project="whisper-training-post")
|
|
|
|
|
|
|
|
|
|
|
# Mixed precision (fp16) is enabled only when a CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
use_fp16 = (device == "cuda")

# Small batches for the smoke test, larger ones for the full run.
train_bs = 4 if test_script is True else 64
eval_bs = 2 if test_script is True else 32

# The step budget is derived from a hard-coded sample count and the desired
# number of epochs, and passed to the trainer as `max_steps`.
num_training_samples = 2385
num_epochs = 3
max_steps_full_training = ceil(num_training_samples * num_epochs / train_bs)
max_steps = 2 if test_script is True else max_steps_full_training

# Evaluate ten times and log a hundred times over the run. Both intervals are
# clamped to at least one step: a short run (max_steps below 10 or 100) would
# otherwise produce an interval of 0, which is not valid for a "steps"
# strategy.
eval_steps = 1 if test_script is True else max(1, max_steps // 10)
logging_steps = 1 if test_script is True else max(1, max_steps // 100)
|
|
|
# A larger warm-up share is used for the two-step smoke test than for the
# full run.
warmup_ratio = 0.5 if test_script is True else 0.2

training_args = Seq2SeqTrainingArguments(
    # What to run and where to write the outputs.
    output_dir=".",
    do_train=True,
    do_eval=True,
    max_steps=max_steps,
    # Step-based evaluation, logging and checkpointing; checkpoints are
    # written at the same interval as evaluations, keeping at most two.
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    logging_strategy="steps",
    logging_steps=logging_steps,
    save_strategy="steps",
    save_steps=eval_steps,
    save_total_limit=2,
    # Optimization settings.
    learning_rate=1e-5,
    warmup_ratio=warmup_ratio,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    # Mixed precision only on GPU; evaluation uses `generate` with a single
    # beam.
    fp16=use_fp16,
    predict_with_generate=True,
    generation_num_beams=1,
    # Experiment tracking integration.
    report_to=report_to,
)
|
|
|
"""Now we can provide the trainer with the model, tokenizer (important: use the one you set language and task to! In this example, it is `processor.tokenizer`), training arguments, datasets, data collator, callback, and the method to compute metrics during evaluation. |
|
|
|
Note that we don't need to place the model on the accelerator device, nor did we have to do it in the data collator with the dataset! The trainer will take care of it, if a GPU is available.
|
""" |
|
|
|
|
|
# Wire the model, training arguments, preprocessed train/validation splits,
# collator, metric function and shuffle callback into one Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_dataset["train"],
    eval_dataset=preprocessed_dataset["validation"],
    # NOTE(review): the feature extractor, not processor.tokenizer, is what
    # gets passed under the `tokenizer` argument — confirm this is intended.
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[ShuffleCallback()]
)
|
|
|
"""Let's |
|
|
|
I hope you haven't left yet. If you have, bad for you, as we are ready for training our model! 🍾 |
|
As Whisper is a pretrained model ready to be used off-the-shelf, it is advisable to evaluate it before training on both the validation and test sets. Let's make sure we do no harm to it.
|
""" |
|
|
|
# Generation settings shared by both baseline evaluations.
_generation_kwargs = {"max_length": 448, "num_beams": 1}

# Baseline on the validation split, measured before trainer.train() runs.
eval_metrics = trainer.evaluate(
    eval_dataset=preprocessed_dataset["validation"],
    metric_key_prefix="eval",
    **_generation_kwargs,
)
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(eval_metrics)

# Baseline on the test split, measured before trainer.train() runs.
test_metrics = trainer.evaluate(
    eval_dataset=preprocessed_dataset["test"],
    metric_key_prefix="test",
    **_generation_kwargs,
)
trainer.log_metrics("test", test_metrics)
trainer.save_metrics("test", test_metrics)
print(test_metrics)
|
|
|
# Run fine-tuning, then save the resulting model.
train_result = trainer.train()
trainer.save_model()

# Log and save the metrics returned by the training run, save the trainer
# state, and print the metrics.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)
|
|
|
"""ADD SOMETHING ABOUT THE TRAINING. |
|
|
|
Now let's evaluate the fine-tuned model on the test set.
|
""" |
|
|
|
# Evaluate on the test split once more, now that training has run, using the
# same generation settings as the earlier evaluations.
_test_split = preprocessed_dataset["test"]
final_metrics = trainer.evaluate(
    eval_dataset=_test_split,
    num_beams=1,
    max_length=448,
    metric_key_prefix="test",
)

# NOTE(review): the "test" split name is the same one used for the
# pre-training test evaluation, so the saved metrics presumably replace that
# baseline — confirm this is intended.
trainer.log_metrics("test", final_metrics)
trainer.save_metrics("test", final_metrics)
print(final_metrics)
|
|
|
|
|
|
|
# Upload to the Hugging Face Hub only when the training arguments opt in.
# The training arguments above do not set `push_to_hub`, and the
# accompanying text states that the model is not pushed in this
# configuration; an unconditional push would also fire during the two-step
# smoke test.
if training_args.push_to_hub:
    trainer.push_to_hub()