{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "ZG_P29nKcSeI"
},
"source": [
"# HuggingFace challenge - Debugger notebook\n",
"Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "YacvHugMc1Ka"
},
"outputs": [],
"source": [
"# %%capture\n",
"# !pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode\n",
"# !pip install datasets==1.18.3\n",
"# !pip install git+https://github.com/huggingface/transformers.git\n",
"# !pip install huggingface_hub==0.1\n",
"# !pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html\n",
"# !pip install jiwer\n",
"# !pip install -U git+https://github.com/huggingface/transformers.git"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vy63SoiZbnB5",
"outputId": "17391c60-b894-4571-b8a4-d46b18cb42e2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/huggingface/transformers.git\n",
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-i45amciw\n",
" Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-i45amciw\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (0.1.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (3.4.2)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (4.10.1)\n",
"Requirement already satisfied: tokenizers!=0.11.3,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (0.11.4)\n",
"Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (0.0.47)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (1.19.5)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (4.62.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (2.23.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (6.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (21.3)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.17.0.dev0) (2019.12.20)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.17.0.dev0) (3.10.0.2)\n",
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.17.0.dev0) (3.0.7)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.17.0.dev0) (3.7.0)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.17.0.dev0) (2.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.17.0.dev0) (2021.10.8)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.17.0.dev0) (3.0.4)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.17.0.dev0) (1.24.3)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.17.0.dev0) (1.15.0)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.17.0.dev0) (1.1.0)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.17.0.dev0) (7.1.2)\n"
]
}
],
"source": [
"# !pip install -U git+https://github.com/huggingface/transformers.git"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "T2utsYSKszvv"
},
"outputs": [],
"source": [
"import platform\n",
"import multiprocessing\n",
"\n",
"import torch\n",
"import transformers\n",
"import datasets\n",
"\n",
"import soundfile"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ejKNEyJEcSeO"
},
"source": [
"## Print main information"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5P6I-W9ts-kR",
"outputId": "bd0c00d8-91c9-4b1a-8f2c-24182c2b227f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10\n",
"CPU cores: 60\n",
"Python version: 3.8.8\n",
"PyTorch version: 1.10.1+cu102\n",
"GPU is visible: True\n",
"Transformers version: 4.16.0.dev0\n",
"Datasets version: 1.18.3\n",
"soundfile version: 0.10.3\n"
]
}
],
"source": [
"print(f\"Platform: {platform.platform()}\")\n",
"print(f\"CPU cores: {multiprocessing.cpu_count()}\")\n",
"\n",
"print(f\"Python version: {platform.python_version()}\")\n",
"\n",
"print(f\"PyTorch version: {torch.__version__}\")\n",
"print(f\"GPU is visible: {torch.cuda.is_available()}\")\n",
"\n",
"print(f\"Transformers version: {transformers.__version__}\")\n",
"print(f\"Datasets version: {datasets.__version__}\")\n",
"\n",
"print(f\"soundfile version: {soundfile.__version__}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_VUKw21PcSeQ"
},
"source": [
"## Check your GPU information (if any)\n",
"If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).\n",
"The driver and CUDA versions are shown in the header of the `nvidia-smi` output."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YT7fRnKctggU",
"outputId": "1fb2c851-11c3-4fcd-ad23-9032f25d7f8d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sat Jan 29 03:27:00 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla V100S-PCI... Off | 00000000:00:06.0 Off | 0 |\n",
"| N/A 35C P0 26W / 250W | 4MiB / 32510MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241,
"referenced_widgets": [
"50a1252082d942b09bfc620a9fa9d1d0",
"e270b7c82f784ebbbba4b17fb07c310d",
"32eb83bb6fd34c56bb345368e47e8f6f",
"34417f648cd54ed5b6d91f53af3e2713",
"7518572223ac480b89af2ab71f38b2ed",
"ce8bb7d0fb744e7b9ce2ff35cfdbc679",
"aa47a09bf444413ba95322d979c1908c",
"0b83a8775ea1441980d8ba945be752fe",
"127389ec566e423ab9a8f60a9d61caaa",
"4e4bc5550505497ba35f6bd7dde2893f",
"e5124c5171e04625b70795e4b7a18819",
"e410e7aecf23433f880a0f7169a8ce97",
"0f6b3cf1d33f46f594934874170bcd83",
"e549178ba75f4939aba6ae1cf743722a",
"9c28978adf974326a21259ae56f47fe9",
"7d3231a0b7794b11af662170b352d9e0"
]
},
"id": "3Wj2W4tWcSeR",
"outputId": "ad4eb63f-d643-45bd-b8d7-6adfefd9f773"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Login successful\n",
"Your token has been saved to /root/.huggingface/token\n",
"\u001b[1m\u001b[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.\n",
"You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default\n",
"\n",
"git config --global credential.helper store\u001b[0m\n"
]
}
],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "wHpUxFQPeWE2"
},
"outputs": [],
"source": [
"%%capture\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TorMtpwPv6RQ"
},
"source": [
"## Quick training run with a dummy model and data\n",
"More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fevoJD15u4Ss",
"outputId": "64745ecf-65b0-494d-a88d-52826eaae0f8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2022-01-28 09:12:30-- https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 31209 (30K) [text/plain]\n",
"Saving to: ‘run_speech_recognition_ctc.py’\n",
"\n",
"run_speech_recognit 100%[===================>] 30.48K --.-KB/s in 0.001s \n",
"\n",
"2022-01-28 09:12:30 (21.4 MB/s) - ‘run_speech_recognition_ctc.py’ saved [31209/31209]\n",
"\n"
]
}
],
"source": [
"!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py\n",
"# !wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XJRA51HjcSeT"
},
"outputs": [],
"source": [
"# \t--learning_rate=\"7.5e-5\" \\\n",
"# 84.5"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hZOB6ZAnsvDX",
"outputId": "7b6a85b5-950c-46a1-c005-b885f8a9bd17"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2020 NVIDIA Corporation\n",
"Built on Mon_Oct_12_20:09:46_PDT_2020\n",
"Cuda compilation tools, release 11.1, V11.1.105\n",
"Build cuda_11.1.TC455_06.29190527_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NKlgW0E-sldT",
"outputId": "b925521a-29d2-4787-dd5b-6520dda688e4"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting bitsandbytes-cuda111\n",
" Downloading bitsandbytes_cuda111-0.26.0-py3-none-any.whl (4.0 MB)\n",
"\u001b[K |████████████████████████████████| 4.0 MB 4.3 MB/s \n",
"\u001b[?25hInstalling collected packages: bitsandbytes-cuda111\n",
"Successfully installed bitsandbytes-cuda111-0.26.0\n"
]
}
],
"source": [
"!pip install bitsandbytes-cuda111"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"remove special characters from datasets: 2558ex [00:00, 4238.92ex/s]\n",
"remove special characters from datasets: 1184ex [00:00, 5236.89ex/s]\n",
"loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n",
"Model config Wav2Vec2Config {\n",
" \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n",
" \"activation_dropout\": 0.0,\n",
" \"adapter_kernel_size\": 3,\n",
" \"adapter_stride\": 2,\n",
" \"add_adapter\": false,\n",
" \"apply_spec_augment\": true,\n",
" \"architectures\": [\n",
" \"Wav2Vec2ForPreTraining\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"bos_token_id\": 1,\n",
" \"classifier_proj_size\": 256,\n",
" \"codevector_dim\": 768,\n",
" \"contrastive_logits_temperature\": 0.1,\n",
" \"conv_bias\": true,\n",
" \"conv_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512\n",
" ],\n",
" \"conv_kernel\": [\n",
" 10,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"conv_stride\": [\n",
" 5,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"ctc_loss_reduction\": \"sum\",\n",
" \"ctc_zero_infinity\": false,\n",
" \"diversity_loss_weight\": 0.1,\n",
" \"do_stable_layer_norm\": true,\n",
" \"eos_token_id\": 2,\n",
" \"feat_extract_activation\": \"gelu\",\n",
" \"feat_extract_dropout\": 0.0,\n",
" \"feat_extract_norm\": \"layer\",\n",
" \"feat_proj_dropout\": 0.1,\n",
" \"feat_quantizer_dropout\": 0.0,\n",
" \"final_dropout\": 0.0,\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout\": 0.1,\n",
" \"hidden_size\": 1024,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 4096,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"layerdrop\": 0.1,\n",
" \"mask_feature_length\": 10,\n",
" \"mask_feature_min_masks\": 0,\n",
" \"mask_feature_prob\": 0.0,\n",
" \"mask_time_length\": 10,\n",
" \"mask_time_min_masks\": 2,\n",
" \"mask_time_prob\": 0.075,\n",
" \"model_type\": \"wav2vec2\",\n",
" \"num_adapter_layers\": 3,\n",
" \"num_attention_heads\": 16,\n",
" \"num_codevector_groups\": 2,\n",
" \"num_codevectors_per_group\": 320,\n",
" \"num_conv_pos_embedding_groups\": 16,\n",
" \"num_conv_pos_embeddings\": 128,\n",
" \"num_feat_extract_layers\": 7,\n",
" \"num_hidden_layers\": 24,\n",
" \"num_negatives\": 100,\n",
" \"output_hidden_size\": 1024,\n",
" \"pad_token_id\": 0,\n",
" \"proj_codevector_dim\": 768,\n",
" \"tdnn_dilation\": [\n",
" 1,\n",
" 2,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"tdnn_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 1500\n",
" ],\n",
" \"tdnn_kernel\": [\n",
" 5,\n",
" 3,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.16.0.dev0\",\n",
" \"use_weighted_layer_sum\": false,\n",
" \"vocab_size\": 32,\n",
" \"xvector_output_dim\": 512\n",
"}\n",
"\n",
"100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.20ba/s]\n",
"100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 31.05ba/s]\n",
"Didn't find file ./wav2vec2-large-xls-r-300m-slovenian/tokenizer.json. We won't load it.\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/vocab.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/tokenizer_config.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/added_tokens.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/special_tokens_map.json\n",
"loading file None\n",
"Adding to the vocabulary\n",
"Adding to the vocabulary\n",
"loading configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/config.json from cache at /workspace/.cache/huggingface/transformers/dabc27df63e37bd2a7a221c7774e35f36a280fbdf917cf54cadfc7df8c786f6f.a3e4c3c967d9985881e0ae550a5f6f668f897db5ab2e0802f9b97973b15970e6\n",
"Model config Wav2Vec2Config {\n",
" \"_name_or_path\": \"facebook/wav2vec2-xls-r-300m\",\n",
" \"activation_dropout\": 0.0,\n",
" \"adapter_kernel_size\": 3,\n",
" \"adapter_stride\": 2,\n",
" \"add_adapter\": false,\n",
" \"apply_spec_augment\": true,\n",
" \"architectures\": [\n",
" \"Wav2Vec2ForPreTraining\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"bos_token_id\": 1,\n",
" \"classifier_proj_size\": 256,\n",
" \"codevector_dim\": 768,\n",
" \"contrastive_logits_temperature\": 0.1,\n",
" \"conv_bias\": true,\n",
" \"conv_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512\n",
" ],\n",
" \"conv_kernel\": [\n",
" 10,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"conv_stride\": [\n",
" 5,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"ctc_loss_reduction\": \"sum\",\n",
" \"ctc_zero_infinity\": false,\n",
" \"diversity_loss_weight\": 0.1,\n",
" \"do_stable_layer_norm\": true,\n",
" \"eos_token_id\": 2,\n",
" \"feat_extract_activation\": \"gelu\",\n",
" \"feat_extract_dropout\": 0.0,\n",
" \"feat_extract_norm\": \"layer\",\n",
" \"feat_proj_dropout\": 0.1,\n",
" \"feat_quantizer_dropout\": 0.0,\n",
" \"final_dropout\": 0.0,\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout\": 0.1,\n",
" \"hidden_size\": 1024,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 4096,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"layerdrop\": 0.1,\n",
" \"mask_feature_length\": 10,\n",
" \"mask_feature_min_masks\": 0,\n",
" \"mask_feature_prob\": 0.0,\n",
" \"mask_time_length\": 10,\n",
" \"mask_time_min_masks\": 2,\n",
" \"mask_time_prob\": 0.075,\n",
" \"model_type\": \"wav2vec2\",\n",
" \"num_adapter_layers\": 3,\n",
" \"num_attention_heads\": 16,\n",
" \"num_codevector_groups\": 2,\n",
" \"num_codevectors_per_group\": 320,\n",
" \"num_conv_pos_embedding_groups\": 16,\n",
" \"num_conv_pos_embeddings\": 128,\n",
" \"num_feat_extract_layers\": 7,\n",
" \"num_hidden_layers\": 24,\n",
" \"num_negatives\": 100,\n",
" \"output_hidden_size\": 1024,\n",
" \"pad_token_id\": 0,\n",
" \"proj_codevector_dim\": 768,\n",
" \"tdnn_dilation\": [\n",
" 1,\n",
" 2,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"tdnn_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 1500\n",
" ],\n",
" \"tdnn_kernel\": [\n",
" 5,\n",
" 3,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.16.0.dev0\",\n",
" \"use_weighted_layer_sum\": false,\n",
" \"vocab_size\": 32,\n",
" \"xvector_output_dim\": 512\n",
"}\n",
"\n",
"loading feature extractor configuration file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/preprocessor_config.json from cache at /workspace/.cache/huggingface/transformers/6fb028b95b394059e7d3b367bbca2382b576c66aebe896f04d2cd34e1b575f5b.d4484dc1c81456a2461485e7168b04347a7b9a4e3b1ef3aba723323b33e12326\n",
"Feature extractor Wav2Vec2FeatureExtractor {\n",
" \"do_normalize\": true,\n",
" \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n",
" \"feature_size\": 1,\n",
" \"padding_side\": \"right\",\n",
" \"padding_value\": 0,\n",
" \"return_attention_mask\": true,\n",
" \"sampling_rate\": 16000\n",
"}\n",
"\n",
"loading weights file https://huggingface.co/facebook/wav2vec2-xls-r-300m/resolve/main/pytorch_model.bin from cache at /workspace/.cache/huggingface/transformers/1e6a6507f3b689035cd4b247e2a37c154e27f39143f31357a49b4e38baeccc36.1edb32803799e27ed554eb7dd935f6745b1a0b17b0ea256442fe24db6eb546cd\n",
"Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_q.bias', 'project_hid.weight', 'quantizer.weight_proj.bias']\n",
"- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"preprocess datasets: 2558ex [00:18, 135.52ex/s]\n",
"preprocess datasets: 1184ex [00:09, 123.58ex/s]\n",
"100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 604.02ba/s]\n",
"100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 849.91ba/s]\n",
"Configuration saved in ./wav2vec2-large-xls-r-300m-slovenian/preprocessor_config.json\n",
"tokenizer config file saved in ./wav2vec2-large-xls-r-300m-slovenian/tokenizer_config.json\n",
"Special tokens file saved in ./wav2vec2-large-xls-r-300m-slovenian/special_tokens_map.json\n",
"added tokens file saved in ./wav2vec2-large-xls-r-300m-slovenian/added_tokens.json\n",
"Configuration saved in ./wav2vec2-large-xls-r-300m-slovenian/config.json\n",
"loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-slovenian/preprocessor_config.json\n",
"loading configuration file ./wav2vec2-large-xls-r-300m-slovenian/config.json\n",
"Model config Wav2Vec2Config {\n",
" \"_name_or_path\": \"./wav2vec2-large-xls-r-300m-slovenian\",\n",
" \"activation_dropout\": 0.1,\n",
" \"adapter_kernel_size\": 3,\n",
" \"adapter_stride\": 2,\n",
" \"add_adapter\": false,\n",
" \"apply_spec_augment\": true,\n",
" \"architectures\": [\n",
" \"Wav2Vec2ForPreTraining\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 1,\n",
" \"classifier_proj_size\": 256,\n",
" \"codevector_dim\": 768,\n",
" \"contrastive_logits_temperature\": 0.1,\n",
" \"conv_bias\": true,\n",
" \"conv_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512\n",
" ],\n",
" \"conv_kernel\": [\n",
" 10,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 3,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"conv_stride\": [\n",
" 5,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2,\n",
" 2\n",
" ],\n",
" \"ctc_loss_reduction\": \"mean\",\n",
" \"ctc_zero_infinity\": false,\n",
" \"diversity_loss_weight\": 0.1,\n",
" \"do_stable_layer_norm\": true,\n",
" \"eos_token_id\": 2,\n",
" \"feat_extract_activation\": \"gelu\",\n",
" \"feat_extract_dropout\": 0.0,\n",
" \"feat_extract_norm\": \"layer\",\n",
" \"feat_proj_dropout\": 0.0,\n",
" \"feat_quantizer_dropout\": 0.0,\n",
" \"final_dropout\": 0.0,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout\": 0.0,\n",
" \"hidden_size\": 1024,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 4096,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"layerdrop\": 0.0,\n",
" \"mask_feature_length\": 64,\n",
" \"mask_feature_min_masks\": 0,\n",
" \"mask_feature_prob\": 0.25,\n",
" \"mask_time_length\": 10,\n",
" \"mask_time_min_masks\": 2,\n",
" \"mask_time_prob\": 0.75,\n",
" \"model_type\": \"wav2vec2\",\n",
" \"num_adapter_layers\": 3,\n",
" \"num_attention_heads\": 16,\n",
" \"num_codevector_groups\": 2,\n",
" \"num_codevectors_per_group\": 320,\n",
" \"num_conv_pos_embedding_groups\": 16,\n",
" \"num_conv_pos_embeddings\": 128,\n",
" \"num_feat_extract_layers\": 7,\n",
" \"num_hidden_layers\": 24,\n",
" \"num_negatives\": 100,\n",
" \"output_hidden_size\": 1024,\n",
" \"pad_token_id\": 32,\n",
" \"proj_codevector_dim\": 768,\n",
" \"tdnn_dilation\": [\n",
" 1,\n",
" 2,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"tdnn_dim\": [\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 512,\n",
" 1500\n",
" ],\n",
" \"tdnn_kernel\": [\n",
" 5,\n",
" 3,\n",
" 3,\n",
" 1,\n",
" 1\n",
" ],\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.16.0.dev0\",\n",
" \"use_weighted_layer_sum\": false,\n",
" \"vocab_size\": 35,\n",
" \"xvector_output_dim\": 512\n",
"}\n",
"\n",
"loading feature extractor configuration file ./wav2vec2-large-xls-r-300m-slovenian/preprocessor_config.json\n",
"Feature extractor Wav2Vec2FeatureExtractor {\n",
" \"do_normalize\": true,\n",
" \"feature_extractor_type\": \"Wav2Vec2FeatureExtractor\",\n",
" \"feature_size\": 1,\n",
" \"padding_side\": \"right\",\n",
" \"padding_value\": 0,\n",
" \"return_attention_mask\": true,\n",
" \"sampling_rate\": 16000\n",
"}\n",
"\n",
"Didn't find file ./wav2vec2-large-xls-r-300m-slovenian/tokenizer.json. We won't load it.\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/vocab.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/tokenizer_config.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/added_tokens.json\n",
"loading file ./wav2vec2-large-xls-r-300m-slovenian/special_tokens_map.json\n",
"loading file None\n",
"Adding to the vocabulary\n",
"Adding to the vocabulary\n",
"/workspace/votic_training/./wav2vec2-large-xls-r-300m-slovenian is already a clone of https://huggingface.co/infinitejoy/wav2vec2-large-xls-r-300m-slovenian. Make sure you pull the latest changes with `repo.git_pull()`.\n",
"Using amp half precision backend\n",
"The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
"/opt/conda/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
" Num examples = 2558\n",
" Num Epochs = 100\n",
" Instantaneous batch size per device = 32\n",
" Total train batch size (w. parallel, distributed & accumulation) = 32\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 8000\n",
" 7%|██▋ | 577/8000 [13:11<2:08:41, 1.04s/it]"
]
}
],
"source": [
"!python run_speech_recognition_ctc.py \\\n",
"\t--dataset_name=\"mozilla-foundation/common_voice_7_0\" \\\n",
"\t--model_name_or_path=\"facebook/wav2vec2-xls-r-300m\" \\\n",
"\t--dataset_config_name=\"sl\" \\\n",
"\t--output_dir=\"./wav2vec2-large-xls-r-300m-slovenian\" \\\n",
"\t--overwrite_output_dir \\\n",
"\t--num_train_epochs=\"100\" \\\n",
"\t--per_device_train_batch_size=\"32\" \\\n",
"\t--per_device_eval_batch_size=\"16\" \\\n",
"\t--gradient_accumulation_steps=\"1\" \\\n",
"\t--learning_rate=\"7e-5\" \\\n",
"\t--warmup_steps=\"1000\" \\\n",
"\t--length_column_name=\"input_length\" \\\n",
"\t--evaluation_strategy=\"steps\" \\\n",
"\t--text_column_name=\"sentence\" \\\n",
"\t--chars_to_ignore , ? . ! \\- \\; \\: \\\" “ % ‘ ” � — ’ … – \\' \\\n",
"\t--save_steps=\"1000\" \\\n",
"\t--eval_steps=\"1000\" \\\n",
"\t--logging_steps=\"100\" \\\n",
"\t--layerdrop=\"0.0\" \\\n",
"\t--activation_dropout=\"0.1\" \\\n",
"\t--save_total_limit=\"2\" \\\n",
"\t--freeze_feature_encoder \\\n",
"\t--feat_proj_dropout=\"0.0\" \\\n",
"\t--mask_time_prob=\"0.75\" \\\n",
"\t--mask_time_length=\"10\" \\\n",
"\t--mask_feature_prob=\"0.25\" \\\n",
"\t--mask_feature_length=\"64\" \\\n",
"\t--gradient_checkpointing \\\n",
"\t--use_auth_token \\\n",
"\t--fp16 \\\n",
"\t--group_by_length \\\n",
"\t--do_train --do_eval \\\n",
" --push_to_hub > out.log"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0zBb4QMVcSeV"
},
"outputs": [],
"source": [
"# !rm -rf wav2vec2-large-xls-r-300m-bashkir"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jxvhTTQ2cSeV"
},
"outputs": [],
"source": [
"!ls -ltr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "okCO9-XTcSeV",
"outputId": "a47bb25e-904a-4c1e-8871-d996a16b6bcc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filesystem Size Used Avail Use% Mounted on\n",
"overlay 3.5T 1.2T 2.2T 34% /\n",
"tmpfs 64M 0 64M 0% /dev\n",
"tmpfs 87G 0 87G 0% /sys/fs/cgroup\n",
"tmpfs 87G 0 87G 0% /dev/shm\n",
"/dev/md0 3.5T 1.2T 2.2T 34% /etc/group\n",
"tmpfs 87G 12K 87G 1% /proc/driver/nvidia\n",
"/dev/vda1 49G 6.5G 42G 14% /usr/bin/nvidia-smi\n",
"udev 87G 0 87G 0% /dev/nvidia0\n",
"tmpfs 87G 0 87G 0% /proc/acpi\n",
"tmpfs 87G 0 87G 0% /proc/scsi\n",
"tmpfs 87G 0 87G 0% /sys/firmware\n"
]
}
],
"source": [
"!df -h"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "axSDvjOMdkxW"
},
"outputs": [],
"source": [
"# !pip install -U datasets"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238,
"referenced_widgets": [
"7c34d36b28e54989b0c509eae1bd9a0f",
"eba629a92467433c92840e4450e7a937",
"cf1afb1025d24c1cbbb1eefd26535a26",
"f347c0838adf462d886a4ae36a3a6b41",
"37bdb17bf4734fd4b92759c874a4d4b8",
"4685ef4f82764fada48035b4de9af9e2",
"aab799184cf8453e9cf026a32abff619",
"1795d07714684311b1ccea7514f298e4",
"7fa8f65c508e4e629b1a2212aaa64ebc",
"c139ed75ff4d47d593f8cb5f3fa4c105",
"776dc15d8836456281084dc154d769e4",
"f3a862eb1219484b8d9381fb0d16b063",
"da3f94cc1140466cbcbdb3e03cbea8c2",
"2fedf1edcc184d9b8c67712511f8bfef",
"25142b9649ef403c8b37cdb7f9a8de4b",
"8f5cd0e3111241b8a61914dac82acf73",
"7340567ea42d42709f8099a249f6b5dd",
"7365cf85ddff4b26a27c9b797c573949",
"2fbc062ac19f4eb7a8adff2a5118bea4",
"ae5b0f9f37e44e8e965f7e20dfdf3bfa",
"24aeaf260d2240d08466c5e3a01d95cb",
"06ec543be0a34943959c3140119c4d6e",
"311cbd6bf6df4c35b7819e49fb55a562",
"3bc2760daaa346b2b20d76d6cf4ed336",
"c4b226675ad84ff29f62847767065469",
"0be3f91b1071464d979c0c59baff32f4",
"7c4a653d81474818b084b71657f71e0f",
"cb10ec01c16a4c50bf8e4c8aec491aa2",
"ec67f65de50b4038ac3b01496ef56f98",
"4b2562825d8e4c5484008cd054e01216",
"209d975f5d4e4300bf01bb6b2472d493",
"690f71c3c232421c8cd92a28b5435b55",
"4f4d422bdd49486c940713c19e754479",
"e5d1a213afc04270926da41e12b30362",
"30afb513746845b481227b3191df4c90",
"c7017ddc94104c27b42658f27f275908",
"155de8f44ddf4021a5d1d4d4968934db",
"cb3b32862a12486f8625d667bb45c368",
"832b4fcaf152402e84bfdaf9833d061f",
"8af6a305cc8a4a038f74f39e6ea8f040",
"4c316c3eddd64af1b4d892516e1ced03",
"efd0fc9b3766457484533a6eb59f2cd4",
"27d72d36fe604e5d96d6a979ed6d50ee",
"f90669ec059249ca81a0e2c5891834db",
"67d3fcb0869a4485b24846d3b1e34fca",
"3db73d64f4e54cad8f8cd0f5facc33c0",
"d434124da4654ada92573070353dbce1",
"3c36f662c44e453ca935753e6dc18060",
"0d0ab06d275d49f5b1ac57b28c53c158",
"61771b0bdfe543b88fc8673a510a986c",
"63d4b794d9df49c6ab6f77f10a76861d",
"42bb543380e14d859f42e966b3c54bc2",
"00a1878e3cda42e1982093e185935937",
"9cce7704e9e74588aa7aa3b9ddf9672f",
"a27c1dd0b5c447058bf8abde274d7085",
"1ee70ac9891d4104ad801f75b4081c9f",
"eda7343054624f4d8a2e2b981b4fab41",
"f56579df97b94a5a8b3a0fbf32905687",
"aee17658cd4b4fe49a759ad6c9d5a576",
"3a6e34083c8f4066a6718c957958cfa6",
"8148f4330d0f441998d9a3ca4942bc22",
"9ea974dfe1184fe3897a7d9d031c7624",
"a968de55d2e148f88084ac96444c17ee",
"c0aeab2086de4ca7ad8b5f0bbcde009c",
"05d04f345a3148dd9053a5d524592333",
"7a68ba6f90a24162a973ba5146c2f546",
"a4411af1dda24dec9b863793ccd22390",
"f085643a56b94b74bb7e883598170f01",
"ee8a677f68a147e5b10a35518616e264",
"315ae5446f264660bbe6119e8261495d",
"64b970adf3af40268fb60e38140157e2",
"2ac4df7918404aed92611750471cd85f",
"7bf164fec94c40858cf5280937f8e00a",
"0e1672eeb5244df9bf0cbd095625d68a",
"ee80362b77ef4375bb931af34bc16d07",
"fed5fdea500f46618789c44aef2bff3b",
"f49c5c9c58ee482a8264e422d4610a8a",
"6a9e0e280ef7493eb4557429d6f53685",
"c51fb67419ed47f98c5ed4ad4e33aeff",
"2de6d3927c534397ab122a9cf6332a33",
"f3891dcc62b74ccd8d5a61b0ca761b2a",
"9958cd546fbe477092527a14bb3bfe21",
"639f180d5e02425dba7d4c4bca07c59b",
"4da0d9054bd74fb2a77bb40371c99a7b",
"3f8a5e226fbf4175b4fa7f39a2a9d290",
"41515b22976648aabe660b8df3506c4c",
"b2a72b0caf104aee8dd95bff01cc52a4",
"6b8769a26838449e9d7d45fc5cc7a6f6",
"50862512d9c14dbd92f8cc3d795d4cd2",
"352fc0a527024af8a284c53f4d521fec",
"67653ac95966464994b1e0a889cfc5d9",
"778d0a9a7de243eba8dd1c0caf3aa82e",
"14eb779636914797867b7315f347839d",
"25a5802292874e49bb42a1489ff54b31",
"89a05d4149534d78935e169c6623f458",
"49f46100f43346d2bdb402e2fd1a1951",
"5e2e7ad6aa8f4f51adf7f6376b84f618",
"2e918f153be0489dbf0ad64bc45c563c",
"c319fa946f3e4380864aed6d3fbb77e7"
]
},
"id": "82uZWUF_cSeW",
"outputId": "e78215f2-d452-4d92-a94c-0a469f8760d4"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/sl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n",
"Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/sl/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2558\n"
]
}
],
"source": [
"from datasets import load_dataset, load_metric, Audio\n",
"\n",
"# Both splits come from the same dataset and config, so the shared arguments are declared once.\n",
"cv_load_args = dict(path=\"mozilla-foundation/common_voice_7_0\", name=\"sl\", use_auth_token=True)\n",
"\n",
"# Training uses train and validation merged; the test split is kept separate.\n",
"common_voice_train = load_dataset(split=\"train+validation\", **cv_load_args)\n",
"common_voice_test = load_dataset(split=\"test\", **cv_load_args)\n",
"\n",
"print(len(common_voice_train))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1Qa9wKa4cSeW",
"outputId": "da721286-89ac-421c-a269-e779449488c6"
},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
" num_rows: 2558\n",
"})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the feature names and row count of the combined train+validation split.\n",
"common_voice_train"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "H_KRIMbEcSeX",
"outputId": "90601843-d465-4cd3-dff0-9d2302e02699"
},
"outputs": [
{
"data": {
"text/plain": [
"7993.75"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Scale check: number of training rows * 100 / 32.\n",
"# NOTE(review): 100 and 32 look like an epoch count and a batch size (i.e. total optimisation steps) - confirm against the training arguments.\n",
"len(common_voice_train) * 100 / 32"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "ZUc_UAMbcSeX"
},
"outputs": [],
"source": [
"# Metadata columns dropped from both splits; the same list applies to train and test.\n",
"metadata_columns = [\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"]\n",
"\n",
"common_voice_train = common_voice_train.remove_columns(metadata_columns)\n",
"common_voice_test = common_voice_test.remove_columns(metadata_columns)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "OKxWKzjMcSeX"
},
"outputs": [],
"source": [
"from datasets import ClassLabel\n",
"import random\n",
"import pandas as pd\n",
"from IPython.display import display, HTML\n",
"\n",
"def show_random_elements(dataset, num_examples=10):\n",
"    \"\"\"Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.\n",
"\n",
"    Args:\n",
"        dataset: object supporting `len()` and indexing with a list of integer positions.\n",
"        num_examples: how many different rows to show.\n",
"\n",
"    Raises:\n",
"        AssertionError: if `num_examples` is larger than `len(dataset)`.\n",
"    \"\"\"\n",
"    assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
"    # random.sample draws without replacement, so the indices are unique without a retry loop.\n",
"    picks = random.sample(range(len(dataset)), num_examples)\n",
"\n",
"    df = pd.DataFrame(dataset[picks])\n",
"    display(HTML(df.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"id": "uR3e--0AcSeY",
"outputId": "efb84606-2717-4040-ca02-86975a2f4824"
},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | sentence | \n", "
---|---|
0 | \n", "Njegove oči so večje od njegovega trebuha. | \n", "
1 | \n", "Cesar lahko postane vsak berač, če ga le ljudstvo izvoli. | \n", "
2 | \n", "Ne, zelene. | \n", "
3 | \n", "Tišina je manj škodljiva kot slab odgovor. | \n", "
4 | \n", "Vsega je konec. | \n", "
5 | \n", "Imam video posnetek mojega včerajšnjega pevskega nastopa. | \n", "
6 | \n", "Dokler sreča pomaga, se da tudi proti toku plavati. | \n", "
7 | \n", "Prijatelje hitreje izgubimo, kot dobimo. | \n", "
8 | \n", "Stara mera, stara vera. | \n", "
9 | \n", "Kislina hudo opeče kožo. | \n", "