In [1]:
from datasets import load_dataset, DatasetDict
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import WhisperProcessor
from datasets import Audio
import evaluate

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

In [2]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "ml", split="train[:5%]+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "ml", split="test[:5%]", use_auth_token=True)

print(common_voice)

Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f)
Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f)


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 22
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 6
    })
})


In [3]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Malayalam", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Malayalam", task="transcribe")

In [4]:
print(common_voice["train"][0])

{'client_id': '29ca16eb2c0faea0be0ad73b5d826f5e81dc6fd4acfa9241a002b5d3619fd51c5b00b009e7b98b50caa5829f8a96697d5942b120749ee63a5d637c632bd0f7bc', 'path': '/home/.cache/huggingface/datasets/downloads/extracted/5e6fee23ff6621c1021a557e4424852db80c5f277edb03408614c85e4831964c/common_voice_ml_28913601.mp3', 'audio': {'path': '/home/.cache/huggingface/datasets/downloads/extracted/5e6fee23ff6621c1021a557e4424852db80c5f277edb03408614c85e4831964c/common_voice_ml_28913601.mp3', 'array': array([-5.9054565e-16, -5.8716256e-14, -5.4170010e-15, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32), 'sampling_rate': 48000}, 'sentence': 'എന്തുകൊണ്ട് യുവാക്കൾ കൂടുതൽ രാഷ്ട്രീയമായി ചിന്തിക്കണം, എന്തുകൊണ്ട് അവർ സംഘടിതരാകണം എന്നതിന്റെ ഉദാത്തമായ ഉദാഹരണമാകുന്നു കേരളം.', 'up_votes': 2, 'down_votes': 0, 'age': '', 'gender': '', 'accent': '', 'locale': 'ml', 'segment': ''}


In [5]:
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [6]:
print(common_voice["train"][0])

{'client_id': '29ca16eb2c0faea0be0ad73b5d826f5e81dc6fd4acfa9241a002b5d3619fd51c5b00b009e7b98b50caa5829f8a96697d5942b120749ee63a5d637c632bd0f7bc', 'path': '/home/.cache/huggingface/datasets/downloads/extracted/5e6fee23ff6621c1021a557e4424852db80c5f277edb03408614c85e4831964c/common_voice_ml_28913601.mp3', 'audio': {'path': '/home/.cache/huggingface/datasets/downloads/extracted/5e6fee23ff6621c1021a557e4424852db80c5f277edb03408614c85e4831964c/common_voice_ml_28913601.mp3', 'array': array([-4.3097585e-14,  1.7633505e-13,  2.9013527e-13, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32), 'sampling_rate': 16000}, 'sentence': 'എന്തുകൊണ്ട് യുവാക്കൾ കൂടുതൽ രാഷ്ട്രീയമായി ചിന്തിക്കണം, എന്തുകൊണ്ട് അവർ സംഘടിതരാകണം എന്നതിന്റെ ഉദാത്തമായ ഉദാഹരണമാകുന്നു കേരളം.', 'up_votes': 2, 'down_votes': 0, 'age': '', 'gender': '', 'accent': '', 'locale': 'ml', 'segment': ''}


In [7]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

do_lower_case = False
do_remove_punctuation = True

normalizer = BasicTextNormalizer()

In [8]:
def prepare_dataset(batch):
    # load and (possibly) resample audio data to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # compute input length of audio sample in seconds
    batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    
    # optional pre-processing steps
    transcription = batch["sentence"]
    if do_lower_case:
        transcription = transcription.lower()
    if do_remove_punctuation:
        transcription = normalizer(transcription).strip()
    
    # encode target text to label ids
    batch["labels"] = processor.tokenizer(transcription).input_ids
    return batch

In [9]:
%%time
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1)

Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-10c57a3e7cf91619.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-8adb63851a4a51f7.arrow


CPU times: user 2.97 s, sys: 16 ms, total: 2.99 s
Wall time: 2.99 s


In [10]:
max_input_length = 30.0

def is_audio_in_length_range(length):
    return length < max_input_length

In [11]:
common_voice["train"] = common_voice["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-153a5b29ef28024e.arrow


In [12]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [14]:
import evaluate

metric = evaluate.load("wer")

In [15]:
# evaluate with the 'normalised' WER
do_normalize_eval = True

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    if do_normalize_eval:
        pred_str = [normalizer(pred) for pred in pred_str]
        label_str = [normalizer(label) for label in label_str]

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [16]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

In [17]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=500,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

PyTorch: setting up devices


In [23]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

/home/whisper-ml-first-model/./ is already a clone of https://huggingface.co/kurianbenoy/whisper-ml-first-model. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [24]:
processor.save_pretrained(training_args.output_dir)

Feature extractor saved in ./preprocessor_config.json
tokenizer config file saved in ./tokenizer_config.json
Special tokens file saved in ./special_tokens_map.json
added tokens file saved in ./added_tokens.json


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22
  Num Epochs = 500
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 500
  Number of trainable parameters = 37760640


Step,Training Loss,Validation Loss


In [29]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "language": "ml",
    "model_name": "Whisper tiny ml - Kurian Benoy",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
    "tags": "whisper-event",
}
trainer.push_to_hub(**kwargs)

Saving model checkpoint to ./
Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin
Feature extractor saved in ./preprocessor_config.json


TypeError: create_model_card() got multiple values for keyword argument 'model_name'