In [None]:
%%capture
!pip install datasets==1.4.1
!pip install transformers==4.4.0
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install mecab-python3
!pip install unidic-lite
!pip isntall audiomentations

In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from datasets import load_dataset, load_metric, ClassLabel, Dataset
from audiomentations import Compose, AddGaussianNoise, Gain, PitchShift, TimeStretch, Shift
from torch.optim.lr_scheduler import LambdaLR
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

import pandas as pd
import numpy as np
import soundfile as sf
import re
import json
import torchaudio
import librosa
import datasets
import MeCab
import pykakasi
import random

import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

# Load dataset and prepare processor

In [None]:
# Load public dataset from University of Tokyo
!wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip
!unzip jsut_ver1.1.zip

path = 'jsut_ver1.1/basic5000/'
df = pd.read_csv(path + 'transcript_utf8.txt', header = None, delimiter = ":", names=["path", "sentence"], index_col=False)
df["path"] = df["path"].map(lambda x: path + 'wav/' + x + ".wav")
df.head()

jsut_voice_train = Dataset.from_pandas(df)

In [None]:
# Metadata columns of Common Voice that are dropped from both splits
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]

# Training uses the train and validation splits together; the test split is kept for evaluation
common_voice_train = load_dataset('common_voice', 'ja', split='train+validation')
common_voice_test = load_dataset('common_voice', 'ja', split='test')

common_voice_train = common_voice_train.remove_columns(unused_columns)
common_voice_test = common_voice_test.remove_columns(unused_columns)

# Training set = JSUT utterances followed by the Common Voice utterances
common_voice_train = datasets.concatenate_datasets(
    [jsut_voice_train, common_voice_train]
)

In [None]:
# Japanese word segmenter. With "-Owakati" MeCab returns the sentence with its
# tokens separated by spaces, e.g. "pythonが大好きです" -> "python が 大好き です"
wakati = MeCab.Tagger("-Owakati")

# Punctuation removed from the transcripts. Written as a raw string: the
# pattern is unchanged, but sequences such as "\," are no longer treated as
# (invalid) string escapes, which newer Python versions warn about.
chars_to_ignore_regex = r'[\,\、\。\.\「\」\…\?\・]'

def remove_special_characters(batch):
    """Segment and clean the transcript of one example.

    ``batch["sentence"]`` is passed through the module-level MeCab tagger
    ``wakati`` and stripped, then every character matched by
    ``chars_to_ignore_regex`` is deleted and the result is stripped again.
    The example is modified in place and returned.
    """
    segmented = wakati.parse(batch["sentence"]).strip()
    without_punctuation = re.sub(chars_to_ignore_regex, '', segmented)
    batch["sentence"] = without_punctuation.strip()
    return batch

# Normalise the transcripts of both splits, one example at a time
common_voice_train = common_voice_train.map(function=remove_special_characters)
common_voice_test = common_voice_test.map(function=remove_special_characters)

In [None]:
# make vocab file
def extract_all_chars(batch):
    """Collect the distinct characters of a batch of transcripts.

    Args:
        batch: mapping whose ``"sentence"`` entry is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        distinct characters (in no particular order) and ``"all_text"`` holds
        the sentences joined by single spaces.
    """
    joined_text = " ".join(batch["sentence"])
    distinct_chars = list({char for char in joined_text})
    return dict(vocab=[distinct_chars], all_text=[joined_text])

# make vocab list and text
# batch_size=-1 hands each split to extract_all_chars as a single batch, so
# row 0 of the result holds the character set of the whole split.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

# Union of the train and test characters. Sorted so that the character -> id
# mapping is identical on every run (set iteration order is not stable across
# interpreter sessions).
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
print(len(vocab_dict))

# The space character is stored as "|", the tokenizer's word delimiter token
vocab_dict["|"] = vocab_dict.pop(" ")

# create unk and pad token
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

# save to json file; explicit UTF-8 because ensure_ascii=False writes the
# Japanese characters verbatim and the platform default encoding may differ
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file, indent=2, ensure_ascii=False)

In [None]:
# Directory that receives the processor files and the training output
save_dir = "./output_models"

# wrap tokenizer and feature extractor to processor
# The vocabulary is read from "vocab.json", the file written by the
# vocab-building cell (this previously pointed at "./vocab_demo.json", which
# nothing in the notebook creates).
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# Raw mono waveform input at 16 kHz, normalised, with attention masks returned
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(save_dir)

# Prepare train and test dataset 

In [None]:
# convert audio to 16kHz (standard sample rate of the wav2vec2 model)
def speech_file_to_array_fn(batch):
    """Load one audio file and resample its first channel to 16 kHz.

    Reads ``batch["path"]`` with torchaudio and stores the resampled waveform
    in ``batch["speech"]``, the new rate in ``batch["sampling_rate"]`` and a
    copy of ``batch["sentence"]`` in ``batch["target_text"]``.

    The source rate is the one torchaudio reports for the file rather than an
    assumed 48 kHz, and the rates are passed to librosa by keyword because
    recent librosa releases no longer accept them positionally.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # only the first channel is used
    waveform = np.asarray(speech_array[0].numpy())
    batch["speech"] = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16_000)
    batch["sampling_rate"] = 16_000
    batch["target_text"] = batch["sentence"]
    return batch

# Decode the audio of both splits with 4 worker processes; the original columns
# are dropped so only the columns written by speech_file_to_array_fn remain.
common_voice_train = common_voice_train.map(
    speech_file_to_array_fn,
    num_proc=4,
    remove_columns=common_voice_train.column_names,
)
common_voice_test = common_voice_test.map(
    speech_file_to_array_fn,
    num_proc=4,
    remove_columns=common_voice_test.column_names,
)

In [None]:
# do augment to enrich common voice dataset
# Four transforms chained in this order, each configured with p=0.8.
augment = Compose(
    transforms=[
        AddGaussianNoise(p=0.8, min_amplitude=0.0001, max_amplitude=0.001),
        PitchShift(p=0.8, min_semitones=-1, max_semitones=1),
        Gain(p=0.8, min_gain_in_db=-6, max_gain_in_db=6),
        TimeStretch(p=0.8, min_rate=0.8, max_rate=1.25),
    ]
)

def augmented_speech(batch, augment):
    """Replace the waveform of one example with an augmented version.

    Args:
        batch: example with a ``"speech"`` sequence and a ``"target_text"``.
        augment: callable invoked as ``augment(samples=..., sample_rate=16000)``.

    Returns:
        The same ``batch`` object with ``"speech"`` set to the augmenter's
        output and ``"sampling_rate"`` set to 16000; ``"target_text"`` is
        written back unchanged.
    """
    waveform = np.array(batch["speech"])
    batch.update(
        speech=augment(samples=waveform, sample_rate=16000),
        sampling_rate=16_000,
        target_text=batch["target_text"],
    )
    return batch

# Augment a random half of the training set: take the 'train' side of a 50/50
# split and run every example through the augmentation pipeline.
common_voice_train_augmented = (
    common_voice_train
    .train_test_split(test_size = 0.5)['train']
    .map(lambda batch: augmented_speech(batch, augment), num_proc=4)
)

# Augmented copies first, followed by the complete original training set
common_voice_train = datasets.concatenate_datasets(
    [common_voice_train_augmented, common_voice_train]
)

In [None]:
def prepare_dataset(batch):
    """Turn a batch of waveforms and transcripts into model inputs and labels.

    Asserts that the batch contains a single sampling rate, then stores the
    processor's ``input_values`` for ``batch["speech"]`` and the token ids of
    ``batch["target_text"]`` (as ``batch["labels"]``). Uses the module-level
    ``processor``.
    """
    distinct_rates = set(batch["sampling_rate"])
    # check that all files have the correct sampling rate
    assert (
        len(distinct_rates) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    audio_features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0])
    batch["input_values"] = audio_features.input_values

    # inside this context the processor call is routed to the tokenizer
    with processor.as_target_processor():
        encoded_text = processor(batch["target_text"])
    batch["labels"] = encoded_text.input_ids
    return batch
 
# prepare dataset: batches of 8 examples, 4 worker processes, and only the
# columns written by prepare_dataset are kept
common_voice_train = common_voice_train.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    num_proc=4,
    remove_columns=common_voice_train.column_names,
)
common_voice_test = common_voice_test.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    num_proc=4,
    remove_columns=common_voice_test.column_names,
)

# Training

In [None]:
# create data collator
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of prepared examples into one batch of tensors.

    Audio inputs and label ids are padded separately through
    ``processor.pad``, each with its own ``max_length`` /
    ``pad_to_multiple_of`` setting. Padded label positions are set to -100.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with "input_values" and "labels")."""
        # inputs and labels have different lengths, so they are padded separately
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_values": feature["input_values"]})
            label_items.append({"input_ids": feature["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        is_padding = labels_batch.attention_mask.ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch

# Collator instance handed to the Trainer; padding=True is also the class default.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# make metric function
# Word error rate metric from `datasets`; compute_metrics below reports it as "wer".
wer_metric = load_metric("wer")

def compute_metrics(pred):
    """Compute the word error rate of one evaluation pass.

    Args:
        pred: object with ``predictions`` (logits) and ``label_ids``.
            ``label_ids`` is modified in place: every -100 is replaced by the
            tokenizer's pad token id so that the labels can be decoded.

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    predicted_text = processor.batch_decode(predicted_ids)
    # we do not want to group tokens when computing the metrics
    reference_text = processor.batch_decode(label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=predicted_text, references=reference_text)}

In [None]:
# create custom learning scheduler

# polynomial decay
def get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.2, last_epoch=-1
):
    """Build a LambdaLR with linear warmup followed by polynomial decay.

    The learning rate rises linearly from 0 to the optimizer's default ``lr``
    during the first ``num_warmup_steps`` steps, then decays polynomially
    (exponent ``power``) to ``lr_end`` at ``num_training_steps`` and stays at
    ``lr_end`` afterwards.

    Raises:
        AssertionError: if the optimizer's default ``lr`` is not greater than
            ``lr_end``.
    """
    lr_init = optimizer.defaults["lr"]
    assert lr_init > lr_end, f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})"

    def lr_lambda(current_step: int):
        # LambdaLR multiplies the returned factor by lr_init, hence the
        # divisions by lr_init below.
        if current_step < num_warmup_steps:
            # warmup phase
            return float(current_step) / float(max(1, num_warmup_steps))
        if current_step > num_training_steps:
            # past the end of the schedule
            return lr_end / lr_init
        # decay phase
        steps_into_decay = current_step - num_warmup_steps
        total_decay_steps = num_training_steps - num_warmup_steps
        fraction_left = 1 - steps_into_decay / total_decay_steps
        decayed_lr = (lr_init - lr_end) * fraction_left ** power + lr_end
        return decayed_lr / lr_init

    return LambdaLR(optimizer, lr_lambda, last_epoch)
 
# wrap custom learning scheduler with trainer
class PolyTrainer(Trainer):
    """Trainer whose scheduler is built by get_polynomial_decay_schedule_with_warmup.

    The constructor is inherited unchanged from ``Trainer``.
    """

    def create_scheduler(self, num_training_steps: int, optimizer=None):
        """Create and store the polynomial-decay scheduler.

        Args:
            num_training_steps: total number of training steps of the run.
            optimizer: optimizer to schedule; defaults to ``self.optimizer``.
                Optional so the override can also be called with the
                ``optimizer=`` keyword that later Trainer versions pass.

        Returns:
            The scheduler, which is also stored in ``self.lr_scheduler``.
        """
        scheduled_optimizer = self.optimizer if optimizer is None else optimizer
        self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(
            scheduled_optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps,
        )
        return self.lr_scheduler

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """Create the optimizer first, then the scheduler that wraps it."""
        # NOTE(review): relies on Trainer.create_optimizer() being provided by
        # the installed transformers version - confirm against the pinned release.
        self.create_optimizer()
        self.create_scheduler(num_training_steps)

In [None]:
# Load the pretrained XLSR-53 checkpoint with a CTC head sized to our vocabulary
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    # output layer / loss
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    # regularisation
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    layerdrop=0.1,
    mask_time_prob=0.1,
    # memory
    gradient_checkpointing=True,
)

# freeze feature extractor
model.freeze_feature_extractor()

# define train argument
training_args = TrainingArguments(
    output_dir=save_dir,
    # batching: 32 examples per device, gradients accumulated over 2 steps
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    group_by_length=True,
    fp16=True,
    # optimisation
    num_train_epochs=200,
    learning_rate=1e-4,
    warmup_steps=1500,
    # evaluate and log every 800 steps, checkpoint every 2400 steps
    evaluation_strategy="steps",
    eval_steps=800,
    logging_steps=800,
    save_steps=2400,
    save_total_limit=2,
    # model selection: lower WER is better
    load_best_model_at_end = True,
    metric_for_best_model='wer',
    greater_is_better=False,
)

# wrap everything to Trainer (PolyTrainer supplies the custom LR schedule)
trainer = PolyTrainer(
    model=model,
    args=training_args,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # the feature extractor is passed in the tokenizer slot
    tokenizer=processor.feature_extractor,
)

In [None]:
# training
train_result = trainer.train()

# Write the final model into save_dir itself. During training only
# checkpoint sub-folders are written there, while the evaluation cell loads
# the model with Wav2Vec2ForCTC.from_pretrained(save_dir).
trainer.save_model(save_dir)

# Testing result

In [None]:
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import MeCab
import pykakasi
import re

#config
# Same text normalisation as in training: MeCab word segmentation plus removal
# of the listed punctuation. The pattern is a raw string so that "\," and
# friends are not treated as (invalid) string escapes; the regex is unchanged.
wakati = MeCab.Tagger("-Owakati")
chars_to_ignore_regex = r'[\,\、\。\.\「\」\…\?\・]'

#load model
processor = Wav2Vec2Processor.from_pretrained(save_dir)
test_model = Wav2Vec2ForCTC.from_pretrained(save_dir)
test_model.to("cuda")
# fixed 48 kHz -> 16 kHz resampler applied to every test clip
resampler = torchaudio.transforms.Resample(48_000, 16_000)

#load testdata
test_dataset = load_dataset("common_voice", "ja", split="test")
wer = load_metric("wer")

# Preprocessing the datasets.
# NOTE: this definition replaces the function of the same name defined earlier
# for training; this version also normalises the transcript.
def speech_file_to_array_fn(batch):
    """Normalise the transcript and load the audio of one test example.

    ``batch["sentence"]`` is segmented with ``wakati`` and cleaned with
    ``chars_to_ignore_regex``; the file at ``batch["path"]`` is loaded with
    torchaudio, passed through the module-level ``resampler`` and stored in
    ``batch["speech"]`` as a squeezed numpy array.
    """
    segmented = wakati.parse(batch["sentence"]).strip()
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', segmented).strip()

    speech_array, sampling_rate = torchaudio.load(batch["path"])
    resampled = resampler(speech_array)
    batch["speech"] = resampled.squeeze().numpy()
    return batch

# Apply the evaluation-time preprocessing to every test example.
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run the fine-tuned model on a batch of test clips
def evaluate(batch):
    """Transcribe a batch of waveforms with greedy CTC decoding.

    ``batch["speech"]`` is padded and converted by ``processor``, run through
    ``test_model`` on the GPU without gradient tracking, and the arg-max token
    ids are decoded into ``batch["pred_strings"]``.
    """
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    with torch.no_grad():
        logits = test_model(input_values, attention_mask=attention_mask).logits

    # most likely token at every frame
    batch["pred_strings"] = processor.batch_decode(torch.argmax(logits, dim=-1))
    return batch

# Transcribe the whole test set, 8 clips per forward pass
result = test_dataset.map(evaluate, batched=True, batch_size=8)

# WER as a percentage with two decimals. The previous "{:2f}" set a minimum
# field width of 2 and therefore printed the default six decimals.
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))

In [None]:
# print some results
# Pick a random example from the prepared test set. (The index was previously
# drawn from `common_voice_test_transcription`, a name that is never defined.)
pick = random.randint(0, len(common_voice_test) - 1)
# Fetch just this row instead of materialising whole columns
sample = common_voice_test[pick]

input_dict = processor(sample["input_values"], return_tensors="pt", padding=True)
with torch.no_grad():  # inference only
    logits = test_model(input_dict.input_values.to("cuda")).logits
pred_ids = torch.argmax(logits, dim=-1)[0]

print("Prediction:")
print(processor.decode(pred_ids).strip())

print("\nLabel:")
# Labels are plain transcripts, not CTC output: decode them without grouping
# (as compute_metrics does) so that doubled characters are not collapsed.
print(processor.decode(sample["labels"], group_tokens=False))
