In [1]:
!pip install datasets==1.18.3
!pip install transformers
!pip install pyctcdecode
!pip install jiwer
!pip install https://github.com/kpu/kenlm/archive/master.zip
!huggingface-cli login

Collecting datasets==1.18.3
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[?25l[K     |█                               | 10 kB 27.7 MB/s eta 0:00:01[K     |██                              | 20 kB 17.0 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.7 MB/s eta 0:00:01[K     |████▏                           | 40 kB 8.9 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 8.5 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 8.8 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 6.3 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 7.0 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 7.6 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 8.2 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 8.2 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 8.2 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 8.2 MB/s eta 

In [2]:
import torch
import torchaudio
from datasets import load_dataset, load_metric,  Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import re
wer = load_metric("wer")
cer = load_metric("cer")


chars_to_ignore_regex = '[\é\！\，\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\’\—\–\·]'

def load_data(dataset_id, language, split='test'):
    test_dataset = load_dataset(dataset_id, language, split=split, use_auth_token=True)
    test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
    return test_dataset

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
    batch["sentence"] = re.sub('!', '', batch["sentence"]).lower() + " "
    batch["sentence"] = batch["sentence"].replace('\"',"").replace("&","").replace("'","").replace("(","").lower() + " "
    batch["sentence"] = batch["sentence"].replace('[',"").replace("]","").replace("\\","").replace("«","").replace("»","").replace(")","").lower() + " "
    batch["sentence"] = batch["sentence"].replace("  "," ").replace("  "," ").replace("  "," ").lower() + " "
    
    batch["speech"] = batch["audio"]["array"]
    return batch


def evaluate_with_lm(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs.to('cuda')).logits
    int_result = processor.batch_decode(logits.cpu().numpy())

    batch["pred_strings"] =  int_result.text

    del int_result
    torch.cuda.empty_cache()

    return batch

def evaluate(batch):

    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to('cuda')).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return batch


Downloading:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [3]:
import torch
import torchaudio
from datasets import load_dataset, load_metric,  Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import re

language="mn"
model_id = 'ayameRushia/wav2vec2-large-xls-r-300m-mn'
dataset_id = "mozilla-foundation/common_voice_8_0"
split="test"

processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id,use_auth_token=True)
model = Wav2Vec2ForCTC.from_pretrained(model_id,use_auth_token=True)
model.to('cuda')

test_dataset = load_data(dataset_id, language, split)
test_dataset = test_dataset.map(speech_file_to_array_fn)

result = test_dataset.map(evaluate_with_lm, batched=True, batch_size=4)

print("WER: {:2f}".format(wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
print("CER: {:2f}".format(cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/374 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/186k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.1k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/mn to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/mn/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8...


Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/mn/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8. Subsequent calls will reuse this data.


0ex [00:00, ?ex/s]

  0%|          | 0/471 [00:00<?, ?ba/s]

WER: 0.313919
CER: 0.102565


Delete previous model and clear the cuda cache

In [4]:
del model
torch.cuda.empty_cache()
!nvidia-smi

Mon Feb  7 01:24:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    40W / 250W |   2483MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
dataset_id = "mozilla-foundation/common_voice_8_0"
language="mn"
split="test"
model_id = 'ayameRushia/wav2vec2-large-xls-r-300m-mn'

processor = Wav2Vec2Processor.from_pretrained(model_id,use_auth_token=True)
model = Wav2Vec2ForCTC.from_pretrained(model_id,use_auth_token=True)
model.to("cuda")

test_dataset = load_data(dataset_id, language, split)
test_dataset = test_dataset.map(speech_file_to_array_fn)

result = test_dataset.map(evaluate, batched=True, batch_size=4)

print("WER: {:2f}".format(wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
print("CER: {:2f}".format(cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Reusing dataset common_voice (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice/mn/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)
Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/mn/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8/cache-93c5dc7ad9262723.arrow


  0%|          | 0/471 [00:00<?, ?ba/s]

WER: 0.582171
CER: 0.160670
