In [1]:
import os

# Expose only GPU 0 to CUDA for this kernel; this is set before `torch` is
# imported further down the notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
# !wget https://huggingface.co/huseinzol05/language-model-bahasa-manglish-combined/resolve/main/model.klm
# !pip3 install pyctcdecode==0.1.0 pypi-kenlm==0.1.20210121

In [3]:
import transformers
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
    set_seed,
    AutoModelForCTC,
    TFWav2Vec2ForCTC,
    TFWav2Vec2PreTrainedModel,
    Wav2Vec2PreTrainedModel,
)
from scipy.special import log_softmax



In [4]:
import torch

In [5]:
import string
import json

# Character-level CTC vocabulary: the empty string at id 0, then a-z, 0-9 and
# finally the space character.
CTC_VOCAB = ['', *string.ascii_lowercase, *string.digits, ' ']
vocab_dict = {token: index for index, token in enumerate(CTC_VOCAB)}

# The tokenizer is configured with "|" as its word delimiter, so the space
# entry is re-keyed to "|" (keeping its id). The two special tokens then take
# the next free ids, [UNK] first and [PAD] last.
vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

# Persist the mapping; the tokenizer reads its vocabulary from this file.
with open("ctc-vocab.json", "w") as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

tokenizer = Wav2Vec2CTCTokenizer(
    "ctc-vocab.json",
    word_delimiter_token="|",
    unk_token="[UNK]",
    pad_token="[PAD]",
)

In [6]:
from glob import glob


def numbered_wavs(folder):
    """Return the ``*.wav`` files directly inside ``folder``, ordered by numeric stem.

    Files are expected to be named ``<integer>.wav``. Sorting on the integer
    keeps ``2.wav`` ahead of ``10.wav``; the label lists loaded later are paired
    with these paths purely by position, so the order matters.

    The stem is extracted with ``os.path`` instead of splitting on ``/`` so the
    sort key does not depend on the path separator or on how deep ``folder`` is.

    Raises:
        ValueError: if a matching file's stem is not an integer.
    """
    return sorted(
        glob(f'{folder}/*.wav'),
        key=lambda path: int(os.path.splitext(os.path.basename(path))[0]),
    )


malay = numbered_wavs('malay-test')
singlish = numbered_wavs('singlish-test')
mandarin = numbered_wavs('mandarin-test')
len(malay), len(singlish), len(mandarin)

(765, 3579, 614)

In [7]:
def read_labels(path):
    """Return the parsed JSON content of the label file at ``path``."""
    with open(path) as handle:
        return json.loads(handle.read())


# One label file per test set; each is later concatenated with the others and
# zipped against the audio paths.
malay_label = read_labels('malay-test.json')
singlish_label = read_labels('singlish-test.json')
mandarin_label = read_labels('mandarin-test.json')

len(malay_label), len(singlish_label), len(mandarin_label)

(765, 3579, 614)

In [8]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled order is identical on every run. All later cells
# index into `audio` / `labels` by position (the first `batch_size` items, the
# per-language index lists), so an unseeded shuffle makes their outputs
# disagree with each other whenever this cell is re-executed.
SHUFFLE_SEED = 42

# Concatenate the three test sets, keeping paths and transcripts aligned.
audio = malay + singlish + mandarin
labels = malay_label + singlish_label + mandarin_label

# `shuffle` applies the same permutation to both lists.
audio, labels = shuffle(audio, labels, random_state=SHUFFLE_SEED)
test_set = list(zip(audio, labels))
test_set[:10]

[('singlish-test/3057.wav', 'the teenagers paddled hard on their boat'),
 ('malay-test/705.wav', 'kenapa justin trudeau seperti kemaluan wanita'),
 ('singlish-test/2631.wav',
  'a letter by a mans daughter pleading for leniency was submitted'),
 ('singlish-test/659.wav', 'and theres thousands of people to meet'),
 ('singlish-test/809.wav', 'how much lower are the prices'),
 ('singlish-test/2040.wav',
  'suddenly a gun shot was fired from a distance which sent the dogs fleeing in an instant'),
 ('singlish-test/1616.wav',
  'a stronger dollar pressures gold making it more expensive for holders of other currencies'),
 ('singlish-test/1816.wav',
  'family as a priority has become real for me and not just a cliche'),
 ('malay-test/147.wav',
  'adakah anda percaya bahawa donald trump adalah kedatangan kedua jesus christ'),
 ('singlish-test/3468.wav',
  'but much of the technology required for such a fantastic instrument didnt yet exist')]

In [9]:
import soundfile as sf
import numpy as np

def norm_audio(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation.

    ``1e-7`` is added to the variance before the square root, so a constant
    input yields zeros instead of a division by zero.
    """
    centred = x - x.mean()
    spread = np.sqrt(x.var() + 1e-7)
    return centred / spread

def sequence_1d(
    seq, maxlen=None, padding: str = 'post', pad_int=0, return_len=False
):
    """Pad a collection of 1-D sequences to a common length and stack them.

    Args:
        seq: iterable of lists or ``np.ndarray`` (arrays are converted to lists).
        maxlen: target length; when falsy, the longest sequence's length is used.
        padding: ``'post'`` appends the padding, ``'pre'`` prepends it.
        pad_int: value used to fill the padded positions.
        return_len: when true, also return the original length of each sequence.

    Returns:
        ``np.array`` of the padded rows, or ``(array, lengths)`` when
        ``return_len`` is true.

    Raises:
        ValueError: if ``padding`` is neither ``'post'`` nor ``'pre'``.
    """
    if padding not in ('post', 'pre'):
        raise ValueError('padding only supported [`post`, `pre`]')

    target = maxlen if maxlen else max(len(item) for item in seq)

    rows, lengths = [], []
    for item in seq:
        values = item.tolist() if isinstance(item, np.ndarray) else item
        # A negative count produces an empty filler, so longer rows are left as-is.
        filler = [pad_int] * (target - len(values))
        rows.append(values + filler if padding == 'post' else filler + values)
        lengths.append(len(values))

    padded = np.array(rows)
    return (padded, lengths) if return_len else padded

def batching(audios):
    """Load audio files and turn them into one padded, normalised batch.

    Each path is read with ``soundfile``, the waveforms are right-padded with
    zeros to the longest one, and a 0/1 attention mask marks the real samples.
    Every row is then standardised using the mean and variance of its real
    samples only, and its padded tail is reset to zero.

    Args:
        audios: iterable of audio file paths.
            NOTE(review): assumes single-channel files - confirm.

    Returns:
        ``(values, attentions)``: the normalised batch as ``float32`` and the
        attention mask produced by ``sequence_1d``.
    """
    waveforms = [sf.read(path)[0] for path in audios]
    padded, lengths = sequence_1d(waveforms, return_len=True)
    attentions = sequence_1d([[1] * n for n in lengths])

    normalised = []
    for row, valid in zip(padded, attentions.sum(-1)):
        voiced = row[:valid]
        scaled = (row - voiced.mean()) / np.sqrt(voiced.var() + 1e-7)
        # Padding was shifted by the mean above; put it back to exactly zero.
        if valid < scaled.shape[0]:
            scaled[valid:] = 0.0
        normalised.append(scaled)

    return np.array(normalised).astype(np.float32), attentions

In [10]:
# Load the CTC model from the local checkpoint directory and move it to the GPU.
# The pad token id and vocabulary size are passed in from the tokenizer built
# from `ctc-vocab.json`.
model = AutoModelForCTC.from_pretrained(
    './checkpoint-115000',
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
).cuda()

In [11]:
# Put the model in evaluation mode; assigning to `_` keeps the returned module
# from being echoed as the cell output.
_ = model.eval()

In [12]:
# Smoke test: preprocess the first `batch_size` shuffled files. `batch_size` is
# reused by the full evaluation loop later in the notebook.
batch_size = 4
batch_x = audio[:batch_size]
normed_input_values, attentions = batching(batch_x)

In [13]:
# Forward pass for the smoke-test batch. This is inference only, so it runs
# under `torch.no_grad()`: autograd does not record the pass and no activations
# are kept alive for a backward pass that never happens.
with torch.no_grad():
    o_pt = model(
        torch.from_numpy(normed_input_values.astype(np.float32)).cuda(),
        attention_mask=torch.from_numpy(attentions).cuda(),
    )

# Move the logits to the CPU as NumPy and convert them to log-probabilities
# over the last axis; this array is what both decoders consume below.
o_pt = o_pt.logits.detach().cpu().numpy()
o_pt = log_softmax(o_pt, axis=-1)

In [14]:
# Greedy decoding: pick the highest-scoring token id at every position and let
# the tokenizer turn each id sequence into text.
pred_ids = np.argmax(o_pt, axis = -1)
tokenizer.batch_decode(pred_ids)

['the teenagers paddled hard on their boat',
 'kenapa justin tradio seperti kemaluan wanita',
 'a letter bya mans daughter pleading for lenien te was submitted',
 'and theres thousands of people to meet']

In [15]:
# Label list handed to the beam-search decoder, in the vocabulary's insertion
# order. The last three keys ("|", "[UNK]", "[PAD]") are replaced by the single
# characters ' ', '?' and '_' respectively.
unique_vocab = list(vocab_dict)
unique_vocab[-3:] = [' ', '?', '_']

In [16]:
from pyctcdecode import build_ctcdecoder
import kenlm

# KenLM language model loaded from the local `model.klm` file (the commented-out
# `wget` near the top of the notebook downloads a file with this name).
kenlm_model = kenlm.Model('model.klm')
# Beam-search CTC decoder over `unique_vocab`, scored with the KenLM model.
# NOTE(review): `alpha` / `beta` are presumably the LM weight and the
# word-insertion bonus - confirm against the pinned pyctcdecode 0.1.0 API.
# `ctc_token_idx` is set to the tokenizer's pad token id, i.e. the position
# that `unique_vocab` labels '_'.
decoder = build_ctcdecoder(
    unique_vocab,
    kenlm_model,
    alpha=0.2,
    beta=1.0,
    ctc_token_idx=tokenizer.pad_token_id
)

In [17]:
# Beam-search decode every utterance of the smoke-test batch and print the text
# of the first returned beam next to its batch position.
for k, utterance_log_probs in enumerate(o_pt):
    beams = decoder.decode_beams(utterance_log_probs, prune_history=True)
    d_lm2, lm_state, timesteps, logit_score, lm_score = beams[0]
    print(k, d_lm2)

0 to know more about this years budget click here
1 you can bake shortbread cookies just with sugar butter and flour
2 all good citizens should learn how to change a light bulb
3 as a child madam surley was constantly teased by other children over her appearance


In [18]:
labels[:batch_size]

['to know more about this years budget click here',
 'you can bake shortbread cookies just with sugar butter and flour',
 'all good citizens should learn how to change a light bulb',
 'as a child madam shirley was constantly teased by other children over her appearance']

In [19]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.

    Spaces are removed from both strings, then the character-level edit
    distance is divided by the number of reference characters.

    Args:
        actual: reference transcript.
        hyp: hypothesis transcript.

    Returns:
        The character error rate as a float. When the reference has no
        characters left after removing spaces there is nothing to divide by;
        instead of raising ``ZeroDivisionError`` the result is ``0.0`` if the
        hypothesis is empty as well and ``1.0`` otherwise.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    # Empty reference: avoid the division by zero (see Returns).
    if not actual:
        return 0.0 if not hyp else 1.0

    # Third-party import kept local to the function, after the cheap early return.
    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to its
    own single character so the word-level edit distance can be computed as a
    character-level one, and the distance is divided by the number of
    reference words.

    Args:
        actual: reference transcript.
        hyp: hypothesis transcript.

    Returns:
        The word error rate as a float. When the reference contains no words
        there is nothing to divide by; instead of raising
        ``ZeroDivisionError`` the result is ``0.0`` if the hypothesis is empty
        as well and ``1.0`` otherwise.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    # Empty reference: avoid the division by zero (see Returns).
    if not ref_words:
        return 0.0 if not hyp_words else 1.0

    # Third-party import kept local to the function, after the cheap early return.
    import Levenshtein as Lev

    # One code point per distinct word, assigned in order of first appearance.
    word2char = {}
    for word in ref_words + hyp_words:
        if word not in word2char:
            word2char[word] = chr(len(word2char))

    ref_encoded = ''.join(word2char[w] for w in ref_words)
    hyp_encoded = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(ref_encoded, hyp_encoded) / len(ref_words)

In [20]:
from tqdm import tqdm

# Per-utterance error rates, appended in the same order as `audio` / `labels`:
# `wer` / `cer` for greedy decoding, `wer_lm` / `cer_lm` for beam search with
# the language model.
wer, cer = [], []
wer_lm, cer_lm = [], []

for i in tqdm(range(0, len(audio), batch_size)):
    torch.cuda.empty_cache()

    batch_x = audio[i: i + batch_size]
    batch_y = labels[i: i + batch_size]
    normed_input_values, attentions = batching(batch_x)
    inputs = torch.from_numpy(normed_input_values.astype(np.float32)).cuda()
    attention_mask = torch.from_numpy(attentions).cuda()

    # Evaluation only: under `torch.no_grad()` autograd does not record the
    # forward pass, so no activations are retained for a backward pass that
    # never runs.
    with torch.no_grad():
        o_pt = model(inputs, attention_mask = attention_mask)
    o_pt = o_pt.logits.detach().cpu().numpy()
    o_pt = log_softmax(o_pt, axis = -1)

    # Greedy hypotheses for the whole batch.
    pred_ids = np.argmax(o_pt, axis = -1)
    pred = tokenizer.batch_decode(pred_ids)

    for k in range(len(o_pt)):
        # First beam returned by the LM-backed decoder for utterance k.
        out = decoder.decode_beams(o_pt[k], prune_history=True)
        d_lm2, lm_state, timesteps, logit_score, lm_score = out[0]

        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

        wer_lm.append(calculate_wer(batch_y[k], d_lm2))
        cer_lm.append(calculate_cer(batch_y[k], d_lm2))

100%|██████████| 1240/1240 [04:23<00:00,  4.71it/s]


In [21]:
# Mean over all utterances, in the order: WER, CER, WER with LM, CER with LM.
tuple(np.mean(scores) for scores in (wer, cer, wer_lm, cer_lm))

(0.1322198446007387,
 0.0481054244857041,
 0.09880169127621556,
 0.041196586938584696)

In [22]:
def positions_containing(fragment):
    """Return the positions in `audio` whose path contains ``fragment``."""
    return [position for position, path in enumerate(audio) if fragment in path]


# Positions of each test set inside the shuffled `audio` list, used to slice the
# per-utterance score lists by language.
index_malay = positions_containing('malay-test/')
index_singlish = positions_containing('singlish-test/')
index_mandarin = positions_containing('mandarin-test/')

In [23]:
# Malay subset means, in the order: WER, CER, WER with LM, CER with LM.
tuple(np.mean(np.asarray(scores)[index_malay]) for scores in (wer, cer, wer_lm, cer_lm))

(0.19561999547293663,
 0.051636391937588406,
 0.12710746406824835,
 0.03917689630621449)

In [24]:
# Singlish subset means, in the order: WER, CER, WER with LM, CER with LM.
tuple(np.mean(np.asarray(scores)[index_singlish]) for scores in (wer, cer, wer_lm, cer_lm))

(0.12763802881676573,
 0.0494915200071987,
 0.09677160640413336,
 0.04271234986432335)

In [26]:
# Mandarin subset means, in the order: WER, CER, WER with LM, CER with LM.
tuple(np.mean(np.asarray(scores)[index_mandarin]) for scores in (wer, cer, wer_lm, cer_lm))

(0.07993515937860181,
 0.035626554824269824,
 0.07536807168546154,
 0.03487760945087219)

In [None]:
# Upload the evaluated PyTorch model to the `wav2vec2-xls-r-300m-mixed`
# repository under the `mesolitica` organization.
model.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')

Upload file pytorch_model.bin:   0%|          | 4.00k/1.18G [00:00<?, ?B/s]

In [None]:
# TensorFlow version of the same checkpoint, created with the same config
# overrides as the PyTorch model; `from_pt=True` requests loading from the
# PyTorch weights stored in the checkpoint directory.
model_tf = TFWav2Vec2ForCTC.from_pretrained(
    './checkpoint-115000',
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
    from_pt=True,
)

In [None]:
# Upload the TensorFlow weights to the same hub repository as the PyTorch model.
model_tf.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')

In [30]:
# Same tokenizer construction as earlier in the notebook, rebuilt from
# `ctc-vocab.json` before it is bundled into the processor below.
tokenizer = Wav2Vec2CTCTokenizer(
    "ctc-vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [31]:
# Feature extractor configured for 16 kHz input with normalisation enabled,
# zero padding and attention masks returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True
)
# The processor bundles the feature extractor and the tokenizer into one object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [32]:
# Upload the processor (feature extractor + tokenizer) to the same hub
# repository as the model weights.
processor.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed
   86e9f45..adf6534  main -> main



'https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed/commit/adf65347379e5902f7488753aef24d4e9d16daff'