DuyTa committed on
Commit
c6b1960
1 Parent(s): 82e8e84
src/EDA.ipynb ADDED
@@ -0,0 +1,391 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 6,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import glob\n",
10
+ "\n",
11
+ "def count_files_by_extension(path, extension):\n",
12
+ " \"\"\"\n",
13
+ " path : root path to check ,\n",
14
+ " extension : .wav , ...\n",
15
+ " \"\"\"\n",
16
+ "\n",
17
+ " files = glob.glob(f\"{path}/*.{extension}\")\n",
18
+ " return len(files)\n",
19
+ "\n",
20
+ "\n",
21
+ "root_path = \"./vin_data/vlsp2020_train_set_02/\"\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 7,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "num_wav_files = count_files_by_extension(root_path, \"wav\")"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 8,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "num_txt_files = count_files_by_extension(root_path, \"txt\")"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 9,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "name": "stdout",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "Số lượng file WAV: 56427\n",
52
+ "Số lượng file text: 56427\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "print(f\"Số lượng file WAV: {num_wav_files}\")\n",
58
+ "print(f\"Số lượng file text: {num_txt_files}\")"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 10,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stdout",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "Tần số mẫu (sample rate): 16000 Hz\n",
71
+ "Số kênh (channels): 1\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "import os\n",
77
+ "import random\n",
78
+ "import wave\n",
79
+ "\n",
80
+ "\n",
81
+ "def get_random_wav_file_info(folder_path):\n",
82
+ " wav_files = glob.glob(f\"{folder_path}/*.wav\")\n",
83
+ " \n",
84
+ " if not wav_files:\n",
85
+ " return None, None\n",
86
+ " \n",
87
+ " random_wav_file = random.choice(wav_files)\n",
88
+ " \n",
89
+ " with wave.open(random_wav_file, 'rb') as wav_file:\n",
90
+ " sample_rate = wav_file.getframerate()\n",
91
+ " channels = wav_file.getnchannels()\n",
92
+ " \n",
93
+ " return sample_rate, channels\n",
94
+ "\n",
95
+ "path_to_wav_folder = \"./vin_data/vlsp2020_train_set_02/\"\n",
96
+ "\n",
97
+ "sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)\n",
98
+ "\n",
99
+ "if sample_rate is not None and channels is not None:\n",
100
+ " print(f\"Tần số mẫu (sample rate): {sample_rate} Hz\")\n",
101
+ " print(f\"Số kênh (channels): {channels}\")\n",
102
+ "else:\n",
103
+ " print(\"Nothing.\")\n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 13,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "import os\n",
113
+ "import csv\n",
114
+ "from tqdm import tqdm\n",
115
+ "\n",
116
+ "def create_csv_from_wav_folder(folder_path, output_csv_file):\n",
117
+ " wav_files = glob.glob(f\"{folder_path}/*.wav\")\n",
118
+ "\n",
119
+ " if not wav_files:\n",
120
+ " print(\"Không có file WAV nào trong thư mục.\")\n",
121
+ " return\n",
122
+ "\n",
123
+ " # Mở tệp CSV đầu ra và tạo bộ đếm số lượng file WAV\n",
124
+ " with open(output_csv_file, mode='w', newline='') as csv_file:\n",
125
+ " csv_writer = csv.writer(csv_file)\n",
126
+ " csv_writer.writerow(['path', 'name','sentence'])\n",
127
+ "\n",
128
+ " for wav_file_path in tqdm(wav_files):\n",
129
+ "\n",
130
+ " text_file_path = os.path.splitext(wav_file_path)[0] + \".txt\"\n",
131
+ " if os.path.exists(text_file_path):\n",
132
+ " with open(text_file_path, 'r') as txt_file:\n",
133
+ " text_content = txt_file.read()\n",
134
+ " else:\n",
135
+ " text_content = \"Not found.\"\n",
136
+ "\n",
137
+ " csv_writer.writerow([wav_file_path, os.path.basename(wav_file_path), sample_rate, channels, text_content])\n"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 14,
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "name": "stderr",
147
+ "output_type": "stream",
148
+ "text": [
149
+ "100%|██████████| 56427/56427 [00:37<00:00, 1492.44it/s]\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "output_csv_file = \"vin.csv\"\n",
155
+ "path_to_wav_folder = \"./vin_data/vlsp2020_train_set_02/\"\n",
156
+ "create_csv_from_wav_folder(path_to_wav_folder, output_csv_file)"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 34,
162
+ "metadata": {},
163
+ "outputs": [
164
+ {
165
+ "data": {
166
+ "text/html": [
167
+ "<div>\n",
168
+ "<style scoped>\n",
169
+ " .dataframe tbody tr th:only-of-type {\n",
170
+ " vertical-align: middle;\n",
171
+ " }\n",
172
+ "\n",
173
+ " .dataframe tbody tr th {\n",
174
+ " vertical-align: top;\n",
175
+ " }\n",
176
+ "\n",
177
+ " .dataframe thead th {\n",
178
+ " text-align: right;\n",
179
+ " }\n",
180
+ "</style>\n",
181
+ "<table border=\"1\" class=\"dataframe\">\n",
182
+ " <thead>\n",
183
+ " <tr style=\"text-align: right;\">\n",
184
+ " <th></th>\n",
185
+ " <th>path</th>\n",
186
+ " <th>name</th>\n",
187
+ " <th>sentence</th>\n",
188
+ " </tr>\n",
189
+ " </thead>\n",
190
+ " <tbody>\n",
191
+ " <tr>\n",
192
+ " <th>0</th>\n",
193
+ " <td>./vin_data/vlsp2020_train_set_02/spkyut-201907...</td>\n",
194
+ " <td>spkyut-20190730-utt000000716.wav</td>\n",
195
+ " <td>cây cam canh là loại cây ăn quả dễ trồng dễ ch...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>1</th>\n",
199
+ " <td>./vin_data/vlsp2020_train_set_02/database_sa3_...</td>\n",
200
+ " <td>database_sa3_1_150h_15Jan2020_cleaned_utt_0000...</td>\n",
201
+ " <td>những đặc sản vùng miền nổi tiếng như miến don...</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>2</th>\n",
205
+ " <td>./vin_data/vlsp2020_train_set_02/speaker_544-0...</td>\n",
206
+ " <td>speaker_544-069450-1.wav</td>\n",
207
+ " <td>trước thông tin này trương nam thành chia sẻ c...</td>\n",
208
+ " </tr>\n",
209
+ " <tr>\n",
210
+ " <th>3</th>\n",
211
+ " <td>./vin_data/vlsp2020_train_set_02/database_sa1_...</td>\n",
212
+ " <td>database_sa1_Jan08_Mar19_cleaned_utt_000005361...</td>\n",
213
+ " <td>giống như những nữ hoàng á</td>\n",
214
+ " </tr>\n",
215
+ " <tr>\n",
216
+ " <th>4</th>\n",
217
+ " <td>./vin_data/vlsp2020_train_set_02/database_sa2_...</td>\n",
218
+ " <td>database_sa2_Jan4_Feb29_cleaned_utt_0000154206...</td>\n",
219
+ " <td>thay vì phun toàn bộ cánh đồng bằng hóa chất c...</td>\n",
220
+ " </tr>\n",
221
+ " </tbody>\n",
222
+ "</table>\n",
223
+ "</div>"
224
+ ],
225
+ "text/plain": [
226
+ " path \\\n",
227
+ "0 ./vin_data/vlsp2020_train_set_02/spkyut-201907... \n",
228
+ "1 ./vin_data/vlsp2020_train_set_02/database_sa3_... \n",
229
+ "2 ./vin_data/vlsp2020_train_set_02/speaker_544-0... \n",
230
+ "3 ./vin_data/vlsp2020_train_set_02/database_sa1_... \n",
231
+ "4 ./vin_data/vlsp2020_train_set_02/database_sa2_... \n",
232
+ "\n",
233
+ " name \\\n",
234
+ "0 spkyut-20190730-utt000000716.wav \n",
235
+ "1 database_sa3_1_150h_15Jan2020_cleaned_utt_0000... \n",
236
+ "2 speaker_544-069450-1.wav \n",
237
+ "3 database_sa1_Jan08_Mar19_cleaned_utt_000005361... \n",
238
+ "4 database_sa2_Jan4_Feb29_cleaned_utt_0000154206... \n",
239
+ "\n",
240
+ " sentence \n",
241
+ "0 cây cam canh là loại cây ăn quả dễ trồng dễ ch... \n",
242
+ "1 những đặc sản vùng miền nổi tiếng như miến don... \n",
243
+ "2 trước thông tin này trương nam thành chia sẻ c... \n",
244
+ "3 giống như những nữ hoàng á \n",
245
+ "4 thay vì phun toàn bộ cánh đồng bằng hóa chất c... "
246
+ ]
247
+ },
248
+ "execution_count": 34,
249
+ "metadata": {},
250
+ "output_type": "execute_result"
251
+ }
252
+ ],
253
+ "source": [
254
+ "import pandas as pd \n",
255
+ "data = pd.read_csv('vin_test.csv')\n",
256
+ "data.head(5)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 30,
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "import csv\n",
266
+ "import random\n",
267
+ "\n",
268
+ "def split_csv_file(input_file, output_file1, output_file2, ratio):\n",
269
+ " with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:\n",
270
+ " csvreader = csv.reader(csvfile)\n",
271
+ " header = next(csvreader) \n",
272
+ " \n",
273
+ " data = list(csvreader)\n",
274
+ " random.shuffle(data)\n",
275
+ "\n",
276
+ " total_rows = len(data)\n",
277
+ " rows_output_file1 = int(total_rows * ratio)\n",
278
+ " rows_output_file2 = total_rows - rows_output_file1\n",
279
+ " \n",
280
+ " # Split the data into two parts\n",
281
+ " data1 = data[:rows_output_file1]\n",
282
+ " data2 = data[rows_output_file1:]\n",
283
+ "\n",
284
+ " with open(output_file1, 'w', newline='', encoding='utf-8') as csvfile1:\n",
285
+ " csvwriter1 = csv.writer(csvfile1, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
286
+ " csvwriter1.writerow(header)\n",
287
+ " csvwriter1.writerows(data1)\n",
288
+ "\n",
289
+ " with open(output_file2, 'w', newline='', encoding='utf-8') as csvfile2:\n",
290
+ " csvwriter2 = csv.writer(csvfile2, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
291
+ " csvwriter2.writerow(header)\n",
292
+ " csvwriter2.writerows(data2)\n",
293
+ "\n",
294
+ "input_file = 'vin.csv'\n",
295
+ "output_file1 = 'vin_train.csv'\n",
296
+ "output_file2 = 'vin_test.csv'\n",
297
+ "ratio = 0.8 \n",
298
+ "\n",
299
+ "split_csv_file(input_file, output_file1, output_file2, ratio)\n"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "from datasets import load_dataset, DatasetDict\n",
309
+ "\n",
310
+ "vivos = DatasetDict()"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 46,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "import os\n",
320
+ "import numpy as np\n",
321
+ "\n",
322
+ "import torch\n",
323
+ "import torchaudio\n",
324
+ "\n",
325
+ "import pandas as pd\n",
326
+ "import whisper\n",
327
+ "import torchaudio.transforms as at\n",
328
+ "from pathlib import Path\n",
329
+ "\n",
330
+ "def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:\n",
331
+ " waveform, sr = torchaudio.load(wave_path, normalize=True)\n",
332
+ " if sample_rate != sr:\n",
333
+ " waveform = at.Resample(sr, sample_rate)(waveform)\n",
334
+ " return waveform\n",
335
+ "\n",
336
+ "\n",
337
+ "\n",
338
+ "def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):\n",
339
+ " audio_transcript_pair_list = []\n",
340
+ " if phase == 'train':\n",
341
+ " csv_file = 'vin_train.csv'\n",
342
+ " else:\n",
343
+ " csv_file = 'vin_test.csv'\n",
344
+ " df = pd.read_csv(csv_file)\n",
345
+ " for index, row in df.iterrows():\n",
346
+ " new_path = Path(row['path'])\n",
347
+ " audio_id = index\n",
348
+ " text = row['sentence']\n",
349
+ " if new_path.exists():\n",
350
+ " audio = load_wave(new_path, sample_rate=sample_rate)[0]\n",
351
+ " # if len(text) > text_max_length or len(audio) > audio_max_sample_length:\n",
352
+ " # print('skip file:', new_path, 'with len text:', len(text), 'and len audio', len(audio))\n",
353
+ " # continue\n",
354
+ " audio_transcript_pair_list.append((audio_id, str(new_path), text))\n",
355
+ " print(audio_transcript_pair_list)\n",
356
+ " return audio, audio_transcript_pair_list\n"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": null,
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "get_list_files_vin100h(phase='train')"
366
+ ]
367
+ }
368
+ ],
369
+ "metadata": {
370
+ "kernelspec": {
371
+ "display_name": "DUY",
372
+ "language": "python",
373
+ "name": "python3"
374
+ },
375
+ "language_info": {
376
+ "codemirror_mode": {
377
+ "name": "ipython",
378
+ "version": 3
379
+ },
380
+ "file_extension": ".py",
381
+ "mimetype": "text/x-python",
382
+ "name": "python",
383
+ "nbconvert_exporter": "python",
384
+ "pygments_lexer": "ipython3",
385
+ "version": "3.9.17"
386
+ },
387
+ "orig_nbformat": 4
388
+ },
389
+ "nbformat": 4,
390
+ "nbformat_minor": 2
391
+ }
src/MITI.ipynb ADDED
@@ -0,0 +1,342 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 64,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import glob\n",
11
+ "\n",
12
+ "def count_files_by_extension(path, extension):\n",
13
+ " \"\"\"\n",
14
+ " path : root path to check,\n",
15
+ " extension : .wav, ...\n",
16
+ " \"\"\"\n",
17
+ " total_count = 0\n",
18
+ " \n",
19
+ " for foldername, subfolders, filenames in os.walk(path):\n",
20
+ " files = glob.glob(os.path.join(foldername, f\"*.{extension}\"))\n",
21
+ " total_count += len(files)\n",
22
+ " \n",
23
+ " return total_count\n",
24
+ "\n",
25
+ "\n",
26
+ "root_path = \"./Cleaned_MITI/dataset_2\""
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 65,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "num_wav_files = count_files_by_extension(root_path, \"wav\")\n",
36
+ "num_txt_files = count_files_by_extension(root_path, \"txt\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 66,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Số lượng file WAV: 2099\n",
49
+ "Số lượng file text: 2099\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "print(f\"Số lượng file WAV: {num_wav_files}\")\n",
55
+ "print(f\"Số lượng file text: {num_txt_files}\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 70,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stdout",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "Tần số mẫu (sample rate): 44100 Hz\n",
68
+ "Số kênh (channels): 1\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "import os\n",
74
+ "import random\n",
75
+ "import wave\n",
76
+ "\n",
77
+ "\n",
78
+ "def get_random_wav_file_info(folder_path):\n",
79
+ " for foldername, subfolders, filenames in os.walk(folder_path): \n",
80
+ " wav_files = glob.glob(f\"{foldername}/*.wav\")\n",
81
+ " \n",
82
+ " if not wav_files:\n",
83
+ " return None, None\n",
84
+ " \n",
85
+ " random_wav_file = random.choice(wav_files)\n",
86
+ " \n",
87
+ " with wave.open(random_wav_file, 'rb') as wav_file:\n",
88
+ " sample_rate = wav_file.getframerate()\n",
89
+ " channels = wav_file.getnchannels()\n",
90
+ " \n",
91
+ " return sample_rate, channels\n",
92
+ "\n",
93
+ "path_to_wav_folder = \"./Cleaned_MITI/dataset_2/\"\n",
94
+ "\n",
95
+ "sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)\n",
96
+ "\n",
97
+ "if sample_rate is not None and channels is not None:\n",
98
+ " print(f\"Tần số mẫu (sample rate): {sample_rate} Hz\")\n",
99
+ " print(f\"Số kênh (channels): {channels}\")\n",
100
+ "else:\n",
101
+ " print(\"Nothing.\")\n"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "def remove_special_characters(input_string):\n",
111
+ " special_characters = ['.', ',', '-', '_', \" \"]\n",
112
+ " \n",
113
+ " # Duyệt qua từng ký tự trong chuỗi\n",
114
+ " filtered_string = ''.join([char for char in input_string if char not in special_characters])\n",
115
+ " \n",
116
+ " return filtered_string\n",
117
+ "\n",
118
+ "# Sử dụng hàm\n",
119
+ "input_string = \"Hello, this_is_a-test.string!\"\n",
120
+ "output_string = remove_special_characters(input_string)\n",
121
+ "print(output_string) # Kết quả: \"Hello thisisa teststring\"\n"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 86,
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ " 84%|████████▎ | 164/196 [00:00<00:00, 1629.92it/s]"
134
+ ]
135
+ },
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "100%|██████████| 196/196 [00:00<00:00, 1580.86it/s]\n",
141
+ "100%|██████████| 218/218 [00:00<00:00, 1440.12it/s]\n",
142
+ "100%|██████████| 216/216 [00:00<00:00, 1364.20it/s]\n",
143
+ "100%|██████████| 205/205 [00:00<00:00, 1412.14it/s]\n",
144
+ "100%|██████████| 204/204 [00:00<00:00, 1426.29it/s]\n",
145
+ "100%|██████████| 220/220 [00:00<00:00, 1511.87it/s]\n",
146
+ "100%|██████████| 225/225 [00:00<00:00, 1499.30it/s]\n",
147
+ "100%|██████████| 175/175 [00:00<00:00, 1492.85it/s]\n",
148
+ "100%|██████████| 220/220 [00:00<00:00, 1496.34it/s]\n",
149
+ "100%|██████████| 220/220 [00:00<00:00, 1480.81it/s]\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "import os\n",
155
+ "import csv\n",
156
+ "from tqdm import tqdm\n",
157
+ "import glob\n",
158
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
159
+ "normalizer = BasicTextNormalizer()\n",
160
+ "def create_csv_from_wav_folder(folder_path, output_csv_file):\n",
161
+ " with open(output_csv_file, mode='w', newline='') as csv_file:\n",
162
+ " csv_writer = csv.writer(csv_file)\n",
163
+ " csv_writer.writerow(['path', 'name', 'sentence'])\n",
164
+ "\n",
165
+ " for person_foldername, _, _ in os.walk(folder_path):\n",
166
+ " if \"person_\" in person_foldername:\n",
167
+ " wav_files = glob.glob(os.path.join(person_foldername, \"*.wav\"))\n",
168
+ "\n",
169
+ " for wav_file_path in tqdm(wav_files):\n",
170
+ " wav_filename = os.path.basename(wav_file_path)\n",
171
+ " text_filename = os.path.splitext(wav_filename)[0] + \".txt\"\n",
172
+ " text_file_path = os.path.join(person_foldername, text_filename)\n",
173
+ "\n",
174
+ " if os.path.exists(text_file_path):\n",
175
+ " with open(text_file_path, 'r') as txt_file:\n",
176
+ " text_content = normalizer(txt_file.read())\n",
177
+ " else:\n",
178
+ " text_content = \"Not found.\"\n",
179
+ "\n",
180
+ " csv_writer.writerow([wav_file_path, wav_filename, text_content])\n",
181
+ "\n",
182
+ "root_path = \"./Cleaned_MITI/dataset_2\" \n",
183
+ "output_csv_file = \"MITI.csv\" \n",
184
+ "\n",
185
+ "create_csv_from_wav_folder(root_path, output_csv_file)\n"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 89,
191
+ "metadata": {},
192
+ "outputs": [
193
+ {
194
+ "data": {
195
+ "text/plain": [
196
+ "2099"
197
+ ]
198
+ },
199
+ "execution_count": 89,
200
+ "metadata": {},
201
+ "output_type": "execute_result"
202
+ }
203
+ ],
204
+ "source": [
205
+ "import pandas as pd \n",
206
+ "data = pd.read_csv('MITI.csv')\n",
207
+ "len(data)"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 90,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "import csv\n",
217
+ "import random\n",
218
+ "\n",
219
+ "def split_csv_file(input_file, output_file1, output_file2, ratio):\n",
220
+ " with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:\n",
221
+ " csvreader = csv.reader(csvfile)\n",
222
+ " header = next(csvreader) \n",
223
+ " \n",
224
+ " data = list(csvreader)\n",
225
+ " random.shuffle(data)\n",
226
+ "\n",
227
+ " total_rows = len(data)\n",
228
+ " rows_output_file1 = int(total_rows * ratio)\n",
229
+ " rows_output_file2 = total_rows - rows_output_file1\n",
230
+ " \n",
231
+ " # Split the data into two parts\n",
232
+ " data1 = data[:rows_output_file1]\n",
233
+ " data2 = data[rows_output_file1:]\n",
234
+ "\n",
235
+ " with open(output_file1, 'w', newline='', encoding='utf-8') as csvfile1:\n",
236
+ " csvwriter1 = csv.writer(csvfile1, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
237
+ " csvwriter1.writerow(header)\n",
238
+ " csvwriter1.writerows(data1)\n",
239
+ "\n",
240
+ " with open(output_file2, 'w', newline='', encoding='utf-8') as csvfile2:\n",
241
+ " csvwriter2 = csv.writer(csvfile2, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
242
+ " csvwriter2.writerow(header)\n",
243
+ " csvwriter2.writerows(data2)\n",
244
+ "\n",
245
+ "input_file = 'MITI.csv'\n",
246
+ "output_file1 = 'MITI_train.csv'\n",
247
+ "output_file2 = 'MITI_test.csv'\n",
248
+ "ratio = 0.8 \n",
249
+ "\n",
250
+ "split_csv_file(input_file, output_file1, output_file2, ratio)\n"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "metadata": {},
257
+ "outputs": [],
258
+ "source": [
259
+ "from datasets import load_dataset, DatasetDict\n",
260
+ "\n",
261
+ "vivos = DatasetDict()"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 46,
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "import os\n",
271
+ "import numpy as np\n",
272
+ "\n",
273
+ "import torch\n",
274
+ "import torchaudio\n",
275
+ "\n",
276
+ "import pandas as pd\n",
277
+ "import whisper\n",
278
+ "import torchaudio.transforms as at\n",
279
+ "from pathlib import Path\n",
280
+ "\n",
281
+ "def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:\n",
282
+ " waveform, sr = torchaudio.load(wave_path, normalize=True)\n",
283
+ " if sample_rate != sr:\n",
284
+ " waveform = at.Resample(sr, sample_rate)(waveform)\n",
285
+ " return waveform\n",
286
+ "\n",
287
+ "\n",
288
+ "\n",
289
+ "def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):\n",
290
+ " audio_transcript_pair_list = []\n",
291
+ " if phase == 'train':\n",
292
+ " csv_file = 'vin_train.csv'\n",
293
+ " else:\n",
294
+ " csv_file = 'vin_test.csv'\n",
295
+ " df = pd.read_csv(csv_file)\n",
296
+ " for index, row in df.iterrows():\n",
297
+ " new_path = Path(row['path'])\n",
298
+ " audio_id = index\n",
299
+ " text = row['sentence']\n",
300
+ " if new_path.exists():\n",
301
+ " audio = load_wave(new_path, sample_rate=sample_rate)[0]\n",
302
+ " # if len(text) > text_max_length or len(audio) > audio_max_sample_length:\n",
303
+ " # print('skip file:', new_path, 'with len text:', len(text), 'and len audio', len(audio))\n",
304
+ " # continue\n",
305
+ " audio_transcript_pair_list.append((audio_id, str(new_path), text))\n",
306
+ " print(audio_transcript_pair_list)\n",
307
+ " return audio, audio_transcript_pair_list\n"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
+ "get_list_files_vin100h(phase='train')"
317
+ ]
318
+ }
319
+ ],
320
+ "metadata": {
321
+ "kernelspec": {
322
+ "display_name": "DUY",
323
+ "language": "python",
324
+ "name": "python3"
325
+ },
326
+ "language_info": {
327
+ "codemirror_mode": {
328
+ "name": "ipython",
329
+ "version": 3
330
+ },
331
+ "file_extension": ".py",
332
+ "mimetype": "text/x-python",
333
+ "name": "python",
334
+ "nbconvert_exporter": "python",
335
+ "pygments_lexer": "ipython3",
336
+ "version": "3.9.17"
337
+ },
338
+ "orig_nbformat": 4
339
+ },
340
+ "nbformat": 4,
341
+ "nbformat_minor": 2
342
+ }
src/download_quantized.py ADDED
@@ -0,0 +1,132 @@
1
+ import logging
2
+ import os
3
+ import re
4
+
5
+ from typing import Optional
6
+
7
+ import huggingface_hub
8
+ import requests
9
+
10
+ from tqdm.auto import tqdm
11
+
12
+ _MODELS = (
13
+ "medium"
14
+
15
+ )
16
+
17
+
18
+ def get_assets_path():
19
+ """Returns the path to the assets directory."""
20
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
21
+
22
+
23
+ def get_logger():
24
+ """Returns the module logger."""
25
+ return logging.getLogger("faster_whisper")
26
+
27
+
28
+ def download_model(
29
+ size_or_id: str,
30
+ output_dir: Optional[str] = None,
31
+ local_files_only: bool = False,
32
+ cache_dir: Optional[str] = None,
33
+ ):
34
+ """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
35
+
36
+ The model is downloaded from https://huggingface.co/DuyTa.
37
+
38
+ Args:
39
+ size_or_id: Size of the model to download (only "medium" is accepted here),
40
+ or a CTranslate2-converted model ID
41
+ from the Hugging Face Hub (e.g. guillaumekln/faster-whisper-large-v2).
42
+ output_dir: Directory where the model should be saved. If not set, the model is saved in
43
+ the cache directory.
44
+ local_files_only: If True, avoid downloading the file and return the path to the local
45
+ cached file if it exists.
46
+ cache_dir: Path to the folder where cached files are stored.
47
+
48
+ Returns:
49
+ The path to the downloaded model.
50
+
51
+ Raises:
52
+ ValueError: if the model size is invalid.
53
+ """
54
+ if re.match(r".*/.*", size_or_id):
55
+ repo_id = size_or_id
56
+ else:
57
+ if size_or_id not in _MODELS:
58
+ raise ValueError(
59
+ "Invalid model size '%s', expected one of: %s"
60
+ % (size_or_id, ", ".join(_MODELS))
61
+ )
62
+
63
+ #repo_id = "DuyTa/vi-whisper-%s-Lora" % size_or_id
64
+ repo_id = "DuyTa/Vietnamese_ASR"
65
+
66
+ allow_patterns = [
67
+ "config.json",
68
+ "model.bin",
69
+ "tokenizer.json",
70
+ "vocabulary.*",
71
+ ]
72
+
73
+ kwargs = {
74
+ "local_files_only": local_files_only,
75
+ "allow_patterns": allow_patterns,
76
+ "tqdm_class": disabled_tqdm,
77
+ }
78
+
79
+ if output_dir is not None:
80
+ kwargs["local_dir"] = output_dir
81
+ kwargs["local_dir_use_symlinks"] = False
82
+
83
+ if cache_dir is not None:
84
+ kwargs["cache_dir"] = cache_dir
85
+
86
+ try:
87
+ return huggingface_hub.snapshot_download(repo_id, **kwargs)
88
+ except (
89
+ huggingface_hub.utils.HfHubHTTPError,
90
+ requests.exceptions.ConnectionError,
91
+ ) as exception:
92
+ logger = get_logger()
93
+ logger.warning(
94
+ "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
95
+ repo_id,
96
+ exception,
97
+ )
98
+ logger.warning(
99
+ "Trying to load the model directly from the local cache, if it exists."
100
+ )
101
+
102
+ kwargs["local_files_only"] = True
103
+ return huggingface_hub.snapshot_download(repo_id, **kwargs)
104
+
105
+
106
+ def format_timestamp(
107
+ seconds: float,
108
+ always_include_hours: bool = False,
109
+ decimal_marker: str = ".",
110
+ ) -> str:
111
+ assert seconds >= 0, "non-negative timestamp expected"
112
+ milliseconds = round(seconds * 1000.0)
113
+
114
+ hours = milliseconds // 3_600_000
115
+ milliseconds -= hours * 3_600_000
116
+
117
+ minutes = milliseconds // 60_000
118
+ milliseconds -= minutes * 60_000
119
+
120
+ seconds = milliseconds // 1_000
121
+ milliseconds -= seconds * 1_000
122
+
123
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
124
+ return (
125
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
126
+ )
127
+
128
+
129
+ class disabled_tqdm(tqdm):
130
+ def __init__(self, *args, **kwargs):
131
+ kwargs["disable"] = True
132
+ super().__init__(*args, **kwargs)
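
A minimal usage sketch for the helpers above (not part of this commit; it assumes the file is importable as "download_quantized" and that the DuyTa/Vietnamese_ASR repository is reachable):

from download_quantized import download_model, format_timestamp

# Fetch the CTranslate2 weights for the quantized "medium" model into ./models;
# without output_dir the snapshot goes to the Hugging Face cache instead.
model_dir = download_model("medium", output_dir="./models")
print("Model files saved to:", model_dir)

# Render a timestamp the way subtitle-style output expects, e.g. "01:05.250".
print(format_timestamp(65.25))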
src/laboratory.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/lora_tuning.py ADDED
@@ -0,0 +1,773 @@
1
+ import argparse
2
+ import gc
3
+ import json
4
+ import logging
5
+ import math
6
+ import os
7
+ from dataclasses import dataclass
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from random import randint
11
+ from typing import Any, Dict, List, Union
12
+
13
+ # datasets imports
14
+ import datasets
15
+
16
+ # metric imports
17
+ import evaluate
18
+ import numpy as np
19
+ import torch
20
+ import transformers
21
+ import wandb
22
+
23
+ # accelerate imports
24
+ from accelerate import Accelerator, dispatch_model
25
+ from accelerate.logging import get_logger
26
+ from datasets import Audio, DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset
27
+
28
+ # hf imports
29
+ from huggingface_hub import Repository
30
+ from torch.utils.data import DataLoader
31
+ from tqdm import tqdm
32
+ from transformers import (
33
+ SchedulerType,
34
+ WhisperForConditionalGeneration,
35
+ WhisperProcessor,
36
+ get_scheduler,
37
+ set_seed,
38
+ )
39
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
40
+ from transformers.utils import get_full_repo_name
41
+
42
+ # peft imports
43
+ from peft import AdaLoraConfig, LoraConfig, PeftModel, get_peft_model
44
+
45
+
46
+ logger = get_logger(__name__, log_level="INFO")
47
+
48
+
49
+ def parse_args():
50
+ parser = argparse.ArgumentParser(description="Whisper Fine-Tuning with AdaLora")
51
+ parser.add_argument(
52
+ "--model_name_or_path",
53
+ type=str,
54
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
55
+ required=True,
56
+ )
57
+ parser.add_argument("--language", type=str, help="Language to use for training; e.g., 'Hindi' ", required=True)
58
+ parser.add_argument("--language_abbr", type=str, help="Language to use for training; e.g., 'hi' ", required=True)
59
+ parser.add_argument(
60
+ "--task", type=str, default="transcribe", help="Task to use for training; e.g., 'transcribe' ", required=False
61
+ )
62
+ parser.add_argument(
63
+ "--dataset_name",
64
+ type=str,
65
+ default="mozilla-foundation/common_voice_11_0",
66
+ help="Dataset to use for training; e.g., 'whisper' ",
67
+ required=False,
68
+ )
69
+ parser.add_argument(
70
+ "--dataset_in_streaming_mode",
71
+ action="store_true",
72
+ help="Whether to use streaming mode for the dataset.",
73
+ )
74
+ parser.add_argument(
75
+ "--do_lower_case", action="store_true", help="lowercase the transcribed text before tokenizing"
76
+ )
77
+ parser.add_argument(
78
+ "--do_remove_punctuation", action="store_true", help="remove punctuation from the transcribed text"
79
+ )
80
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
81
+ parser.add_argument(
82
+ "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
83
+ )
84
+ parser.add_argument("--max_audio_input_length", type=float, default=30.0, help="Maximum audio length in seconds.")
85
+ parser.add_argument(
86
+ "--preprocessing_num_workers",
87
+ type=int,
88
+ default=None,
89
+ help="The number of processes to use for the preprocessing.",
90
+ )
91
+ parser.add_argument(
92
+ "--per_device_train_batch_size",
93
+ type=int,
94
+ default=8,
95
+ help="Batch size (per device) for the training dataloader.",
96
+ )
97
+ parser.add_argument(
98
+ "--per_device_eval_batch_size",
99
+ type=int,
100
+ default=8,
101
+ help="Batch size (per device) for the evaluation dataloader.",
102
+ )
103
+ parser.add_argument(
104
+ "--buffer_size",
105
+ type=int,
106
+ default=5000,
107
+ help="Number of samples to prefetch in the streaming mode.",
108
+ )
109
+ parser.add_argument(
110
+ "--dataloader_pin_memory",
111
+ action="store_true",
112
+ help="Whether or not to pin memory for the DataLoader.",
113
+ )
114
+ parser.add_argument(
115
+ "--dataloader_num_workers",
116
+ type=int,
117
+ default=0,
118
+ help="Number of subprocesses to use for data loading.",
119
+ )
120
+ parser.add_argument(
121
+ "--learning_rate",
122
+ type=float,
123
+ default=5e-5,
124
+ help="Initial learning rate (after the potential warmup period) to use.",
125
+ )
126
+ parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
127
+ parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
128
+ parser.add_argument(
129
+ "--max_train_steps",
130
+ type=int,
131
+ default=None,
132
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
133
+ )
134
+ parser.add_argument(
135
+ "--gradient_accumulation_steps",
136
+ type=int,
137
+ default=1,
138
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
139
+ )
140
+ parser.add_argument(
141
+ "--lr_scheduler_type",
142
+ type=SchedulerType,
143
+ default="linear",
144
+ help="The scheduler type to use.",
145
+ choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
146
+ )
147
+ parser.add_argument(
148
+ "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
149
+ )
150
+ parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
151
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
152
+ parser.add_argument(
153
+ "--load_best_model",
154
+ action="store_true",
155
+ help="Whether to load the best model at the end of training",
156
+ )
157
+ parser.add_argument(
158
+ "--with_tracking",
159
+ action="store_true",
160
+ help="Whether to enable experiment trackers for logging.",
161
+ )
162
+ parser.add_argument(
163
+ "--report_to",
164
+ type=str,
165
+ default="all",
166
+ help=(
167
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
168
+ ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
169
+ "Only applicable when `--with_tracking` is passed."
170
+ ),
171
+ )
172
+ parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
173
+ parser.add_argument(
174
+ "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
175
+ )
176
+ parser.add_argument(
177
+ "--checkpointing_steps",
178
+ type=int,
179
+ default=500,
180
+ help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
181
+ )
182
+ parser.add_argument(
183
+ "--logging_steps",
184
+ type=int,
185
+ default=100,
186
+ help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
187
+ )
188
+ parser.add_argument(
189
+ "--evaluation_steps",
190
+ type=int,
191
+ default=500,
192
+ help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
193
+ )
194
+ parser.add_argument(
195
+ "--resume_from_checkpoint",
196
+ type=str,
197
+ default=None,
198
+ help="If the training should continue from a checkpoint folder.",
199
+ )
200
+
201
+ # lora/adalora specific args
202
+ parser.add_argument(
203
+ "--use_peft",
204
+ action="store_true",
205
+ help="Whether to use PEFT",
206
+ )
207
+ parser.add_argument(
208
+ "--use_adalora",
209
+ action="store_true",
210
+ help="Whether to use AdaLoRA or LoRA. If set, uses AdaLoRA instead of the default LoRA.",
211
+ )
212
+ parser.add_argument(
213
+ "--init_r",
214
+ type=int,
215
+ default=12,
216
+ help="Initial AdaLoRA rank",
217
+ )
218
+ parser.add_argument(
219
+ "--target_r",
220
+ type=int,
221
+ default=4,
222
+ help="Target AdaLoRA rank",
223
+ )
224
+ parser.add_argument(
225
+ "--tinit",
226
+ type=int,
227
+ default=200,
228
+ help="number of warmup steps for AdaLoRA wherein no pruning is performed",
229
+ )
230
+ parser.add_argument(
231
+ "--tfinal",
232
+ type=int,
233
+ default=1000,
234
+ help=" fix the resulting budget distribution and fine-tune the model for tfinal steps when using AdaLoRA ",
235
+ )
236
+ parser.add_argument(
237
+ "--delta_t",
238
+ type=int,
239
+ default=10,
240
+ help="interval of steps for AdaLoRA to update rank",
241
+ )
242
+ parser.add_argument(
243
+ "--lora_alpha",
244
+ type=int,
245
+ default=32,
246
+ help="LORA alpha",
247
+ )
248
+ parser.add_argument(
249
+ "--r",
250
+ type=int,
251
+ default=8,
252
+ help="LORA rank",
253
+ )
254
+ parser.add_argument(
255
+ "--lora_dropout",
256
+ type=float,
257
+ default=0.1,
258
+ help="LORA dropout",
259
+ )
260
+ parser.add_argument(
261
+ "--orth_reg_weight",
262
+ type=float,
263
+ default=0.5,
264
+ help="Orthogonal regularization weight",
265
+ )
266
+ parser.add_argument(
267
+ "--debug_mode",
268
+ action="store_true",
269
+ help="Whether to use debug mode",
270
+ )
271
+
272
+ args = parser.parse_args()
273
+
274
+ if args.push_to_hub:
275
+ assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
276
+
277
+ return args
278
+
279
+
280
+ def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
281
+ if "+" in split:
282
+ # load multiple splits separated by the `+` symbol *with* streaming mode
283
+ dataset_splits = [
284
+ load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)
285
+ for split_name in split.split("+")
286
+ ]
287
+ # interleave multiple splits to form one dataset
288
+ interleaved_dataset = interleave_datasets(dataset_splits)
289
+ return interleaved_dataset
290
+ else:
291
+ # load a single split *with* streaming mode
292
+ dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)
293
+ return dataset
294
+
295
+
296
+ def prepare_dataset_wrapper(do_lower_case, do_remove_punctuation, processor, normalizer):
297
+ def prepare_dataset(batch):
298
+ # load and (possibly) resample audio data to 16kHz
299
+ audio = batch["audio"]
300
+
301
+ # compute log-Mel input features from input audio array
302
+ batch["input_features"] = processor.feature_extractor(
303
+ audio["array"], sampling_rate=audio["sampling_rate"]
304
+ ).input_features[0]
305
+ # compute input length of audio sample in seconds
306
+ batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
307
+
308
+ # optional pre-processing steps
309
+ transcription = batch["sentence"]
310
+ if do_lower_case:
311
+ transcription = transcription.lower()
312
+ if do_remove_punctuation:
313
+ transcription = normalizer(transcription).strip()
314
+
315
+ # encode target text to label ids
316
+ batch["labels"] = processor.tokenizer(transcription).input_ids
317
+ return batch
318
+
319
+ return prepare_dataset
320
+
321
+
322
+ def save_model_hook(models, weights, output_dir):
323
+ for model in models:
324
+ model.save_pretrained(output_dir)
325
+ # make sure to pop weight so that corresponding model is not saved again
326
+ weights.pop()
327
+
328
+
329
+ def load_model_hook(models, input_dir):
330
+ while len(models) > 0:
331
+ model = models.pop()
332
+ # pop models so that they are not loaded again
333
+ PeftModel.from_pretrained(model.base_model.model, input_dir)
334
+
335
+
336
+ @dataclass
337
+ class DataCollatorSpeechSeq2SeqWithPadding:
338
+ processor: Any
339
+
340
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
341
+ # split inputs and labels since they have to be of different lengths and need different padding methods
342
+ # first treat the audio inputs by simply returning torch tensors
343
+ input_features = [{"input_features": feature["input_features"]} for feature in features]
344
+ batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
345
+
346
+ # get the tokenized label sequences
347
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
348
+ # pad the labels to max length
349
+ labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
350
+
351
+ # replace padding with -100 to ignore loss correctly
352
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
353
+
354
+ # if bos token is appended in previous tokenization step,
355
+ # cut bos token here as it's append later anyways
356
+ if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
357
+ labels = labels[:, 1:]
358
+
359
+ batch["labels"] = labels
360
+
361
+ return batch
362
+
363
+
364
+ def get_audio_length_processor(max_input_length):
365
+ def is_audio_in_length_range(length):
366
+ return length < max_input_length
367
+
368
+ return is_audio_in_length_range
369
+
370
+
371
+ def evaluation_loop(model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator):
372
+ model.eval()
373
+ predictions = []
374
+ references = []
375
+ normalized_predictions = []
376
+ normalized_references = []
377
+ for _, batch in enumerate(tqdm(eval_dataloader)):
378
+ with torch.cuda.amp.autocast():
379
+ with torch.no_grad():
380
+ generated_tokens = (
381
+ model.generate(
382
+ input_features=batch["input_features"],
383
+ forced_decoder_ids=forced_decoder_ids,
384
+ max_new_tokens=255,
385
+ )
386
+ .cpu()
387
+ .numpy()
388
+ )
389
+ labels = batch["labels"].cpu().numpy()
390
+ labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
391
+ decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
392
+ decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
393
+ predictions.extend(decoded_preds)
394
+ references.extend(decoded_labels)
395
+ normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
396
+ normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
397
+ del generated_tokens, labels, batch
398
+ gc.collect()
399
+ wer = 100 * metric.compute(predictions=predictions, references=references)
400
+ normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
401
+ eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}
402
+ if accelerator.get_tracker("wandb"):
403
+ sample_size = min(len(predictions), 256)
404
+ ids = [randint(0, len(predictions) - 1) for p in range(0, sample_size)]
405
+ sample_predictions = [predictions[i] for i in ids]
406
+ sample_references = [references[i] for i in ids]
407
+ sample_normalized_predictions = [normalized_predictions[i] for i in ids]
408
+ sample_normalized_references = [normalized_references[i] for i in ids]
409
+ table_rows = [
410
+ list(r)
411
+ for r in zip(
412
+ sample_predictions, sample_references, sample_normalized_predictions, sample_normalized_references
413
+ )
414
+ ]
415
+ eval_metrics["eval_samples"] = wandb.Table(
416
+ columns=["predictions", "references", "normalized_predictions", "normalized_references"],
417
+ rows=table_rows,
418
+ )
419
+ return eval_metrics
420
+
421
+
422
+ def main():
423
+ args = parse_args()
424
+
425
+ # initialize accelerator
426
+ accelerator = (
427
+ Accelerator(
428
+ log_with=args.report_to,
429
+ project_dir=args.output_dir,
430
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
431
+ )
432
+ if args.with_tracking
433
+ else Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
434
+ )
435
+
436
+ # Make one log on every process with the configuration for debugging.
437
+ logging.basicConfig(
438
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
439
+ datefmt="%m/%d/%Y %H:%M:%S",
440
+ level=logging.INFO,
441
+ )
442
+ logger.info(accelerator.state, main_process_only=False)
443
+ if accelerator.is_local_main_process:
444
+ datasets.utils.logging.set_verbosity_warning()
445
+ transformers.utils.logging.set_verbosity_info()
446
+ else:
447
+ datasets.utils.logging.set_verbosity_error()
448
+ transformers.utils.logging.set_verbosity_error()
449
+
450
+ # If passed along, set the training seed now.
451
+ if args.seed is not None:
452
+ set_seed(args.seed)
453
+
454
+ # Handle the repository creation
455
+ if accelerator.is_main_process:
456
+ if args.push_to_hub:
457
+ if args.hub_model_id is None:
458
+ repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
459
+ else:
460
+ repo_name = args.hub_model_id
461
+ repo = Repository(args.output_dir, clone_from=repo_name)
462
+
463
+ with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
464
+ if "step_*" not in gitignore:
465
+ gitignore.write("step_*\n")
466
+ if "epoch_*" not in gitignore:
467
+ gitignore.write("epoch_*\n")
468
+ elif args.output_dir is not None:
469
+ os.makedirs(args.output_dir, exist_ok=True)
470
+ accelerator.wait_for_everyone()
471
+
472
+ # load dataset either in streaming mode or not
473
+ processor = WhisperProcessor.from_pretrained(args.model_name_or_path, language=args.language, task=args.task)
474
+ normalizer = BasicTextNormalizer()
475
+ prepare_dataset = prepare_dataset_wrapper(args.do_lower_case, args.do_remove_punctuation, processor, normalizer)
476
+ is_audio_in_length_range = get_audio_length_processor(args.max_audio_input_length)
477
+ data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
478
+
479
+ if args.dataset_in_streaming_mode:
480
+ raw_datasets = IterableDatasetDict()
481
+ loading_method = load_streaming_dataset
482
+ else:
483
+ raw_datasets = DatasetDict()
484
+ loading_method = load_dataset
485
+
486
+ if args.debug_mode:
487
+ train_split = "train[:100]"
488
+ test_split = "test[:10]"
489
+ else:
490
+ train_split = "train+validation"
491
+ test_split = "test"
492
+
493
+ raw_datasets["train"] = loading_method(
494
+ args.dataset_name, args.language_abbr, split=train_split, use_auth_token=True
495
+ )
496
+ raw_datasets["test"] = loading_method(args.dataset_name, args.language_abbr, split=test_split, use_auth_token=True)
497
+ raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))
498
+
499
+ logger.info("Dataset loaded: %s", raw_datasets)
500
+ logger.info(f'{raw_datasets["train"][0]}')
501
+
502
+ vectorized_datasets = raw_datasets.map(
503
+ prepare_dataset,
504
+ remove_columns=list(next(iter(raw_datasets.values())).features),
505
+ num_proc=args.preprocessing_num_workers,
506
+ ).with_format("torch")
507
+
508
+ if args.dataset_in_streaming_mode:
509
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
510
+ buffer_size=args.buffer_size,
511
+ seed=args.seed,
512
+ )
513
+
514
+ # filter out audio files that are too long from the training set
515
+ is_audio_in_length_range = get_audio_length_processor(args.max_audio_input_length)
516
+ vectorized_datasets["train"] = vectorized_datasets["train"].filter(
517
+ is_audio_in_length_range, input_columns=["input_length"]
518
+ )
519
+
520
+ # get dataloaders
521
+ train_dataloader = DataLoader(
522
+ vectorized_datasets["train"],
523
+ batch_size=args.per_device_train_batch_size,
524
+ shuffle=True,
525
+ collate_fn=data_collator,
526
+ num_workers=args.dataloader_num_workers,
527
+ pin_memory=args.dataloader_pin_memory,
528
+ )
529
+ eval_dataloader = DataLoader(
530
+ vectorized_datasets["test"],
531
+ batch_size=args.per_device_eval_batch_size,
532
+ collate_fn=data_collator,
533
+ num_workers=args.dataloader_num_workers,
534
+ pin_memory=args.dataloader_pin_memory,
535
+ )
536
+
537
+ # metric
538
+ metric = evaluate.load("wer")
539
+
540
+ # model
541
+ model = WhisperForConditionalGeneration.from_pretrained(args.model_name_or_path, load_in_8bit=True)
542
+ model.config.forced_decoder_ids = None
543
+ model.config.suppress_tokens = []
544
+ if len(set(model.hf_device_map.values()).intersection({"cpu", "disk"})) > 0:
545
+ raise ValueError("Training on CPU or disk is not supported.")
546
+ if len(set(model.hf_device_map.values())) > 1:
547
+ device_map = model.hf_device_map.copy()
548
+ # required because `labels` are on main execution device (0) while the output of `proj_out` is on other device.
549
+ # So, this leads to a device mismatch error when calculating cross-entropy between logits and labels.
550
+ # Won't arise during inference as `labels` aren't supplied during that time
551
+ # instead of changing device of one of the tied modules, I have to do this for all tied modules
552
+ # else the execution device of remaining tied modules isn't changed
553
+ device_map["model.decoder.embed_tokens"] = model._hf_hook.execution_device
554
+ device_map["model.decoder.embed_positions"] = model._hf_hook.execution_device
555
+ device_map["proj_out"] = model._hf_hook.execution_device
556
+ dispatch_model(model, device_map=device_map)
557
+
558
+ # preparing peft model
559
+ if args.use_peft:
560
+ from peft import prepare_model_for_int8_training
561
+
562
+ model = prepare_model_for_int8_training(model)
563
+
564
+ # as Whisper model uses Conv layer in encoder, checkpointing disables grad computation
565
+ # to avoid this, make the inputs trainable
566
+ def make_inputs_require_grad(module, input, output):
567
+ output.requires_grad_(True)
568
+
569
+ model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)
570
+
571
+ # wrapping model with adalora tuner
572
+ if args.use_adalora:
573
+ config = AdaLoraConfig(
574
+ init_r=args.init_r,
575
+ target_r=args.target_r,
576
+ beta1=0.85,
577
+ beta2=0.85,
578
+ tinit=args.tinit,
579
+ tfinal=args.tfinal,
580
+ deltaT=args.delta_t,
581
+ lora_alpha=args.lora_alpha,
582
+ lora_dropout=args.lora_dropout,
583
+ target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
584
+ orth_reg_weight=args.orth_reg_weight,
585
+ )
586
+ else:
587
+ config = LoraConfig(
588
+ r=args.r,
589
+ lora_alpha=args.lora_alpha,
590
+ target_modules=["q_proj", "v_proj"],
591
+ lora_dropout=args.lora_dropout,
592
+ )
593
+
594
+ model = get_peft_model(model, config)
595
+ model.print_trainable_parameters()
596
+
597
+ # optimizer
598
+ optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
599
+
600
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
601
+ if args.max_train_steps is None:
602
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
603
+ else:
604
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
605
+
606
+ # scheduler
607
+ lr_scheduler = get_scheduler(
608
+ name=args.lr_scheduler_type,
609
+ optimizer=optimizer,
610
+ num_warmup_steps=args.num_warmup_steps,
611
+ num_training_steps=args.max_train_steps,
612
+ )
613
+
614
+ # Prepare everything with our `accelerator`.
615
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
616
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
617
+ )
618
+
619
+ accelerator.print(model)
620
+
621
+ # Note here that the max steps is adjusted by the accelerator's num_processes
622
+ args.max_train_steps = math.ceil(args.max_train_steps / accelerator.num_processes)
623
+ if args.use_peft and args.use_adalora:
624
+ model.base_model.peft_config["default"].total_step = args.max_train_steps
625
+ # model.base_model.peft_config.total_step = args.max_train_steps
626
+
627
+ # We need to initialize the trackers we use, and also store our configuration.
628
+ # The trackers initializes automatically on the main process.
629
+ if args.with_tracking:
630
+ run_name = f"run-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
631
+ experiment_config = vars(args)
632
+ # TensorBoard cannot log Enums, need the raw value
633
+ experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
634
+ accelerator.init_trackers(
635
+ "Whisper PEFT Fine-Tuning", config=experiment_config, init_kwargs={"wandb": {"name": run_name}}
636
+ )
637
+
638
+ # saving and loading checkpoints for resuming training
639
+ accelerator.register_save_state_pre_hook(save_model_hook)
640
+ accelerator.register_load_state_pre_hook(load_model_hook)
641
+
642
+ total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
643
+ logger.info("***** Running training *****")
644
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
645
+ logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
646
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
647
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
648
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
649
+ # Only show the progress bar once on each machine.
650
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
651
+ global_step = 0
652
+ starting_epoch = 0
653
+ best_metric = None
654
+ resume_step = 0
655
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)
656
+
657
+ # Potentially load in the weights and states from a previous save
658
+ if args.resume_from_checkpoint:
659
+ accelerator.load_state(args.resume_from_checkpoint)
660
+ path = os.path.basename(args.resume_from_checkpoint)
661
+ training_difference = os.path.splitext(path)[0]
662
+ global_step = resume_step = int(training_difference.replace("step_", ""))
663
+ starting_epoch = resume_step // len(train_dataloader)
664
+ resume_step -= starting_epoch * len(train_dataloader)
665
+
666
+ # We need to adjust the progress bar to the current step
667
+ progress_bar.update(resume_step)
668
+ for epoch in range(starting_epoch, args.num_train_epochs):
669
+ model.train()
670
+ if args.with_tracking:
671
+ total_loss = 0
672
+ running_loss = 0
673
+ for step, batch in enumerate(accelerator.skip_first_batches(train_dataloader, num_batches=resume_step)):
674
+ with accelerator.accumulate(model):
675
+ outputs = model(**batch)
676
+ loss = outputs.loss
677
+ accelerator.backward(loss)
678
+ optimizer.step()
679
+ lr_scheduler.step()
680
+
681
+ # Update the importance of low-rank matrices
682
+ # and allocate the budget accordingly.
683
+ # This is only needed for AdaLora.
684
+ # Note that this requires parameter gradients.
685
+ # Hence being called before optimizer.zero_grad().
686
+ if args.use_peft and args.use_adalora:
687
+ model.update_and_allocate(global_step)
688
+
689
+ optimizer.zero_grad()
690
+ global_step += 1
691
+ progress_bar.update(1)
692
+
693
+ if args.with_tracking:
694
+ step_loss = accelerator.reduce(loss.detach().clone()).item()
695
+ total_loss += step_loss
696
+ running_loss += step_loss
697
+
698
+ if global_step % args.checkpointing_steps == 0:
699
+ output_dir = os.path.join(args.output_dir, f"step_{global_step}")
700
+ accelerator.save_state(output_dir)
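+ # save_state checkpoints the model, optimizer, scheduler and RNG states; the save hook registered above customizes how the model weights are written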
701
+
702
+ if global_step % args.logging_steps == 0:
703
+ if args.with_tracking:
704
+ accelerator.log({"train/running_loss": running_loss / args.logging_steps}, step=global_step)
705
+ running_loss = 0
706
+
707
+ if global_step % args.evaluation_steps == 0:
708
+ eval_metrics = evaluation_loop(
709
+ model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
710
+ )
711
+ if args.with_tracking:
712
+ logger.info(f"Step {global_step} eval metrics: {eval_metrics}")
713
+ accelerator.log(eval_metrics, step=global_step)
714
+ if best_metric is None or eval_metrics["eval/wer"] < best_metric:
715
+ best_metric = eval_metrics["eval/wer"]
716
+ accelerator.save_state(os.path.join(args.output_dir, "best_checkpoint"))
717
+ model.train()
718
+
719
+ if global_step >= args.max_train_steps:
720
+ break
721
+
722
+ if args.with_tracking:
723
+ train_epoch_loss = total_loss / (step + 1)
724
+ logger.info(f"Epoch {epoch} train loss: {train_epoch_loss}")
725
+ accelerator.log({"epoch/train_loss": train_epoch_loss}, step=epoch)
726
+
727
+ if args.push_to_hub and epoch <= args.num_train_epochs - 1:
728
+ accelerator.wait_for_everyone()
729
+ unwrapped_model = accelerator.unwrap_model(model)
730
+ unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
731
+ # evaluate the model at the end of each epoch
732
+ eval_metrics = evaluation_loop(
733
+ model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
734
+ )
735
+ if args.with_tracking:
736
+ logger.info(f"Step {global_step} eval metrics: {eval_metrics}")
737
+ accelerator.log(eval_metrics, step=global_step)
738
+ if best_metric is None or eval_metrics["eval/wer"] < best_metric:
739
+ best_metric = eval_metrics["eval/wer"]
740
+ accelerator.save_state(os.path.join(args.output_dir, "best_checkpoint"))
741
+
742
+ if accelerator.is_main_process:
743
+ processor.tokenizer.save_pretrained(args.output_dir)
744
+ repo.push_to_hub(
745
+ commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
746
+ )
747
+
748
+ if args.load_best_model:
749
+ # load the best model
750
+ accelerator.load_state(os.path.join(args.output_dir, "best_checkpoint"))
751
+ model.resize_modules_by_rank_pattern(model.peft_config["default"].rank_pattern, "default")
752
+ eval_metrics = evaluation_loop(
753
+ model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
754
+ )
755
+ if args.with_tracking:
756
+ best_metrics = {"best_" + k: v for k, v in eval_metrics.items()}
757
+ accelerator.log(best_metrics, step=global_step)
758
+
759
+ accelerator.wait_for_everyone()
760
+ unwrapped_model = accelerator.unwrap_model(model)
761
+ unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
762
+ if accelerator.is_main_process:
763
+ processor.tokenizer.save_pretrained(args.output_dir)
764
+ if args.push_to_hub:
765
+ repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
766
+
767
+ with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
768
+ eval_metrics.pop("eval_samples")
769
+ json.dump(eval_metrics, f)
770
+
771
+
772
+ if __name__ == "__main__":
773
+ main()
src/merge_lora.py ADDED
@@ -0,0 +1,44 @@
1
+ import argparse
2
+ import functools
3
+ import os
4
+
5
+ from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizerFast,\
6
+ WhisperProcessor
7
+ from peft import PeftModel, PeftConfig
8
+ from utils.utils import print_arguments, add_arguments
9
+
10
+ parser = argparse.ArgumentParser(description=__doc__)
11
+ add_arg = functools.partial(add_arguments, argparser=parser)
12
+ add_arg("lora_model", type=str, default="output/whisper-tiny/checkpoint-best/", help="微调保存的模型路径")
13
+ add_arg('output_dir', type=str, default='models/', help="Directory where the merged model is saved")
14
+ add_arg("local_files_only", type=bool, default=False, help="是否只在本地加载模型,不尝试下载")
15
+ args = parser.parse_args()
16
+ print_arguments(args)
17
+
18
+ assert os.path.exists(args.lora_model), f"The model path {args.lora_model} does not exist"
19
+
20
+ peft_config = PeftConfig.from_pretrained(args.lora_model)
21
+ # load the PEFT config to find the base model the LoRA adapter was trained from
22
+ base_model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, device_map={"": "cpu"},
23
+ local_files_only=args.local_files_only)
24
+
25
+ model = PeftModel.from_pretrained(base_model, args.lora_model, local_files_only=args.local_files_only)
26
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(peft_config.base_model_name_or_path,
27
+ local_files_only=args.local_files_only)
28
+ tokenizer = WhisperTokenizerFast.from_pretrained(peft_config.base_model_name_or_path,
29
+ local_files_only=args.local_files_only)
30
+ processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path,
31
+ local_files_only=args.local_files_only)
32
+
33
+
34
+ model = model.merge_and_unload()
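+ # merge_and_unload folds the LoRA weights into the base model and returns a plain Transformers model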
35
+ model.train(False)
36
+
37
+ save_directory = os.path.join(args.output_dir, f'{os.path.basename(peft_config.base_model_name_or_path)}-finetune')
38
+ os.makedirs(save_directory, exist_ok=True)
39
+
40
+ model.save_pretrained(save_directory)
41
+ feature_extractor.save_pretrained(save_directory)
42
+ tokenizer.save_pretrained(save_directory)
43
+ processor.save_pretrained(save_directory)
44
+ print(f'Merged model saved to: {save_directory}')
src/prepare_data.py ADDED
@@ -0,0 +1,212 @@
1
+ import logging
2
+
3
+ import datasets
4
+ from datasets import DatasetDict, load_dataset, concatenate_datasets
5
+ from tqdm import tqdm
6
+ from transformers import (
7
+ AutoConfig,
8
+ AutoFeatureExtractor,
9
+ AutoModelForSpeechSeq2Seq,
10
+ AutoTokenizer,
11
+ set_seed,
12
+ )
13
+ from transformers.utils.versions import require_version
14
+ from transformers.utils import check_min_version
15
+ from tqdm import tqdm
16
+
17
+ from audiomentations import (
18
+ AddBackgroundNoise,
19
+ AddGaussianNoise,
20
+ Compose,
21
+ Gain,
22
+ OneOf,
23
+ PitchShift,
24
+ PolarityInversion,
25
+ TimeStretch,
26
+ )
27
+
28
+
29
+ check_min_version("4.27.0.dev0")
30
+
31
+ require_version(
32
+ "datasets>=1.18.0",
33
+ "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
34
+ )
35
+
36
+ logger = logging.getLogger(__name__)
37
+ from datasets import Dataset, DatasetDict
38
+ import torchaudio
39
+ from torchaudio import transforms as at
40
+ import pandas as pd
41
+ import torch
42
+ from pathlib import Path
43
+ import random
44
+ def main():
45
+ # Set seed before initializing model.
46
+ set_seed(42)
47
+
48
+ # 5. Load pretrained model, tokenizer, and feature extractor
49
+ #
50
+ # Distributed training:
51
+ # The .from_pretrained methods guarantee that only one local process can concurrently
52
+ config = AutoConfig.from_pretrained(
53
+ "openai/whisper-medium", revision="main", use_auth_token=True
54
+ )
55
+
56
+ config.update({"forced_decoder_ids": None, "suppress_tokens": None})
57
+
58
+ # SpecAugment for whisper models
59
+ # if getattr(config, "model_type", None) == "whisper":
60
+ config.update({"apply_spec_augment": True})
61
+
62
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
63
+ "openai/whisper-medium",
64
+ revision="main",
65
+ use_auth_token=True,
66
+ )
67
+ tokenizer = AutoTokenizer.from_pretrained(
68
+ "openai/whisper-medium",
69
+ use_fast=True,
70
+ revision="main",
71
+ use_auth_token=True,
72
+ )
73
+
74
+ tokenizer.set_prefix_tokens(language="vi", task="transcribe")
75
+
76
+ # 7. Preprocessing the datasets.
77
+ # We need to read the audio files as arrays and tokenize the targets.
78
+ max_input_length = 30.0 * 16000
79
+ min_input_length = 0.0 * 16000
80
+ audio_column_name = "audio"
81
+ num_workers = 16
82
+ text_column_name = "text"
83
+ model_input_name = feature_extractor.model_input_names[0]
84
+
85
+ # if SpecAugment is used for whisper models, return attention_mask to guide the mask along time axis
86
+ forward_attention_mask = True
87
+
88
+ # noise_dir = "../noise/ESC-50-master/audio/"
89
+ # define augmentation
90
+ augmentation = Compose(
91
+ [
92
+ TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2, leave_length_unchanged=True),
93
+ Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.1),
94
+ PitchShift(min_semitones=-4, max_semitones=4, p=0.2),
95
+ ]
96
+ )
97
+
98
+ def augment_dataset(batch):
99
+ # load and (possibly) resample audio data to 16kHz
100
+ sample = batch["audio"]
101
+
102
+ # apply augmentation
103
+ augmented_waveform = augmentation(
104
+ sample["array"], sample_rate=16000
105
+ )
106
+ batch["audio"]["array"] = augmented_waveform
107
+ return batch
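+ # NOTE: augment_dataset is not mapped below; if augmentation is wanted it could be applied to the train split (e.g. vin_100h["train"].map(augment_dataset)) before prepare_dataset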
108
+
109
+ def prepare_dataset(batch):
110
+ # process audio
111
+ sample = batch[audio_column_name]
112
+ inputs = feature_extractor(
113
+ sample,
114
+ sampling_rate= 16000,
115
+ return_attention_mask=forward_attention_mask,
116
+ )
117
+ # process audio length
118
+ batch[model_input_name] = inputs.get(model_input_name)[0]
119
+ batch["input_length"] = len(sample)
120
+ if forward_attention_mask:
121
+ batch["attention_mask"] = inputs.get("attention_mask")[0]
122
+
123
+ # process targets
124
+ input_str = batch[text_column_name]
125
+ batch["labels"] = tokenizer(input_str).input_ids
126
+ return batch
127
+
128
+
129
+ def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:
130
+ waveform, sr = torchaudio.load(wave_path, normalize=True)
131
+ if sample_rate != sr:
132
+ waveform = at.Resample(sr, sample_rate)(waveform)
133
+ return waveform
134
+
135
+
136
+ def get_list_files_MITI(phase, sample_rate=16000, audio_max_sample_length=480000, fraction=0.15):
137
+ audio_list = []
138
+ text_list = []
139
+ if phase == 'train':
140
+ csv_file = 'vin_train.csv'
141
+ else:
142
+ csv_file = 'vin_test.csv'
143
+ df = pd.read_csv(csv_file)
144
+
145
+ # Calculate the number of samples to select based on the fraction
146
+ num_samples = int(len(df) * fraction)
147
+
148
+ # Randomly select the indices of samples
149
+ selected_indices = random.sample(range(len(df)), num_samples)
150
+
151
+ for index, row in tqdm(df.iterrows()):
152
+ if index not in selected_indices:
153
+ continue
154
+
155
+ new_path = Path(row['path'])
156
+ audio_id = index
157
+ text = row['sentence']
158
+
159
+ if new_path.exists():
160
+ audio = load_wave(new_path, sample_rate=sample_rate)[0]
161
+ if len(audio) > audio_max_sample_length or len(audio) == 0:
162
+ print('skip file:', new_path, 'with len audio', len(audio))
163
+ continue
164
+ audio_list.append(audio)
165
+ text_list.append(text)
166
+
167
+ return audio_list, text_list
168
+
169
+ # Assuming you have two CSV files, 'vin_train.csv' and 'vin_test.csv', in the same directory
170
+
171
+ # Get the training dataset
172
+ train_audio, train_text = get_list_files_MITI(phase='train')
173
+
174
+ # Get the testing dataset
175
+ test_audio, test_text = get_list_files_MITI(phase='test')
176
+
177
+ # Create the Dataset objects
178
+ train_dataset = Dataset.from_dict({"audio": train_audio, "text": train_text})
179
+ test_dataset = Dataset.from_dict({"audio": test_audio, "text": test_text})
180
+
181
+ # Create the DatasetDict
182
+ vin_100h = DatasetDict({"train": train_dataset, "test": test_dataset})
183
+
184
+
185
+
186
+
187
+
188
+ print(vin_100h)
189
+
190
+
191
+
192
+
193
+
194
+ vectorized_datasets = vin_100h.map(
195
+ prepare_dataset,
196
+ remove_columns=["audio", "text"],
197
+ num_proc=1,
198
+ desc="preprocess train dataset",
199
+ )
200
+
201
+
202
+ print(vectorized_datasets)
203
+
204
+ vectorized_datasets.save_to_disk(
205
+ "./vin_10h", num_proc=1
206
+ )
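+ # the processed dataset can later be reloaded with datasets.load_from_disk("./vin_10h")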
207
+
208
+ return
209
+
210
+
211
+ if __name__ == "__main__":
212
+ main()
src/realtime.py ADDED
@@ -0,0 +1,157 @@
1
+ #! python3.7
2
+
3
+ import argparse
4
+ import io
5
+ import os
6
+ import speech_recognition as sr
7
+ import whisperx
8
+ import torch
9
+
10
+ from datetime import datetime, timedelta
11
+ from queue import Queue
12
+ from tempfile import NamedTemporaryFile
13
+ from time import sleep
14
+ from sys import platform
15
+
16
+
17
+ def main():
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument("--model", default="Vietnamese_ASR/ct2ranslate", help="Size of model or the local path for model ",
20
+ type=str)
21
+ parser.add_argument("--non_english", action='store_true',
22
+ help="Don't use the English model.")
23
+ parser.add_argument("--language", default="vi", help="The language to infer the model with whisper", type=str)
24
+ parser.add_argument("--device", default="cpu",
25
+ help="Choose device for inference "
26
+ , type=str)
27
+ parser.add_argument("--energy_threshold", default=900,
28
+ help="Energy level for mic to detect.", type=int)
29
+ parser.add_argument("--record_timeout", default=0.6,
30
+ help="How real-time the recording is in seconds.", type=float)
31
+ parser.add_argument("--phrase_timeout", default=3,
32
+ help="How much empty space between recordings before we "
33
+ "consider it a new line in the transcription.", type=float)
34
+ if 'linux' in platform:
35
+ parser.add_argument("--default_microphone", default='pulse',
36
+ help="Default microphone name for SpeechRecognition. "
37
+ "Run this with 'list' to view available Microphones.", type=str)
38
+ args = parser.parse_args()
39
+
40
+
41
+ # The last time a recording was retrieved from the queue.
42
+ phrase_time = None
43
+ # Current raw audio bytes.
44
+ last_sample = bytes()
45
+ # Thread safe Queue for passing data from the threaded recording callback.
46
+ data_queue = Queue()
47
+ # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
48
+ recorder = sr.Recognizer()
49
+ recorder.energy_threshold = args.energy_threshold
50
+ # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically, to the point where the SpeechRecognizer never stops recording.
51
+ recorder.dynamic_energy_threshold = False
52
+
53
+ # Important for linux users.
54
+ # Prevents a permanent application hang or crash caused by using the wrong microphone
55
+ if 'linux' in platform:
56
+ mic_name = args.default_microphone
57
+ if not mic_name or mic_name == 'list':
58
+ print("Available microphone devices are: ")
59
+ for index, name in enumerate(sr.Microphone.list_microphone_names()):
60
+ print(f"Microphone with name \"{name}\" found")
61
+ return
62
+ else:
63
+ for index, name in enumerate(sr.Microphone.list_microphone_names()):
64
+ if mic_name in name:
65
+ source = sr.Microphone(sample_rate=16000, device_index=index)
66
+ break
67
+ else:
68
+ source = sr.Microphone(sample_rate=16000)
69
+
70
+ # Load / Download model
71
+ model = args.model
72
+ # if args.model != "large" and not args.non_english:
73
+ # model = model + ".en"
74
+ audio_model = whisperx.load_model(model, device=args.device, compute_type="float16", language = args.language)
75
+
76
+ record_timeout = args.record_timeout
77
+ phrase_timeout = args.phrase_timeout
78
+
79
+ temp_file = NamedTemporaryFile().name
80
+ transcription = ['']
81
+
82
+ with source:
83
+ recorder.adjust_for_ambient_noise(source)
84
+
85
+ def record_callback(_, audio:sr.AudioData) -> None:
86
+ """
87
+ Threaded callback function to receive audio data when recordings finish.
88
+ audio: An AudioData containing the recorded bytes.
89
+ """
90
+ # Grab the raw bytes and push it into the thread safe queue.
91
+ data = audio.get_raw_data()
92
+ data_queue.put(data)
93
+
94
+ # Create a background thread that will pass us raw audio bytes.
95
+ # We could do this manually but SpeechRecognizer provides a nice helper.
96
+ recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
97
+
98
+ # Cue the user that we're ready to go.
99
+ print("Model loaded.\n")
100
+
101
+ while True:
102
+ try:
103
+ now = datetime.utcnow()
104
+ # Pull raw recorded audio from the queue.
105
+ if not data_queue.empty():
106
+ phrase_complete = False
107
+ # If enough time has passed between recordings, consider the phrase complete.
108
+ # Clear the current working audio buffer to start over with the new data.
109
+ if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
110
+ last_sample = bytes()
111
+ phrase_complete = True
112
+ # This is the last time we received new audio data from the queue.
113
+ phrase_time = now
114
+
115
+ # Concatenate our current audio data with the latest audio data.
116
+ while not data_queue.empty():
117
+ data = data_queue.get()
118
+ last_sample += data
119
+
120
+ # Use AudioData to convert the raw data to wav data.
121
+ audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
122
+ wav_data = io.BytesIO(audio_data.get_wav_data())
123
+
124
+ # Write wav data to the temporary file as bytes.
125
+ with open(temp_file, 'w+b') as f:
126
+ f.write(wav_data.read())
127
+
128
+ # Read the transcription.
129
+ result = audio_model.transcribe(temp_file, language=args.language, batch_size=8)
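+ # whisperX returns a dict with a "segments" list; only the first segment's text is used here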
130
+ text = result['segments'][0]['text'].strip()
131
+
132
+ # If we detected a pause between recordings, add a new item to our transcription.
133
+ # Otherwise edit the existing one.
134
+ if phrase_complete:
135
+ transcription.append(text)
136
+ else:
137
+ transcription[-1] = text
138
+
139
+ # Clear the console to reprint the updated transcription.
140
+ os.system('cls' if os.name=='nt' else 'clear')
141
+ for line in transcription:
142
+ print(line)
143
+ # Flush stdout.
144
+ print('', end='', flush=True)
145
+
146
+ # Infinite loops are bad for processors, must sleep.
147
+ sleep(0.25)
148
+ except KeyboardInterrupt:
149
+ break
150
+
151
+ print("\n\nTranscription:")
152
+ for line in transcription:
153
+ print(line)
154
+
155
+
156
+ if __name__ == "__main__":
157
+ main()
src/requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ git+https://github.com/huggingface/peft.git@main
2
+ bitsandbytes
3
+ accelerate
4
+ loralib
5
+ librosa
6
+ datasets>=2.6.1
7
+ evaluate>=0.3.0
8
+ jiwer
9
+ tensorboard
10
+ soundfile==0.12.1
11
+ git+https://github.com/m-bain/whisperX.git
12
+ # nvidia-cudnn-cu11==8.7.0.84 may be needed (cuDNN for whisperX GPU inference)
13
+ lightning-fabric
14
+ pyaudio
15
+ SpeechRecognition
src/test_whisper.ipynb ADDED
@@ -0,0 +1,1546 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "9d7b03aae28b4282b143eb17c3d8d687",
12
+ "version_major": 2,
13
+ "version_minor": 0
14
+ },
15
+ "text/plain": [
16
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
17
+ ]
18
+ },
19
+ "metadata": {},
20
+ "output_type": "display_data"
21
+ }
22
+ ],
23
+ "source": [
24
+ "from huggingface_hub import notebook_login\n",
25
+ "\n",
26
+ "notebook_login()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [
34
+ {
35
+ "name": "stderr",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Found cached dataset vivos (/home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0)\n",
39
+ "Found cached dataset vivos (/home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0)\n"
40
+ ]
41
+ }
42
+ ],
43
+ "source": [
44
+ "from datasets import load_dataset, DatasetDict\n",
45
+ "\n",
46
+ "vivos = DatasetDict()\n",
47
+ "\n",
48
+ "vivos[\"train\"] = load_dataset(\"vivos\", split=\"train\", use_auth_token=True)\n",
49
+ "vivos[\"test\"] = load_dataset(\"vivos\", split=\"test\", use_auth_token=True)\n",
50
+ "\n",
51
+ "\n"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 38,
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "data": {
61
+ "text/plain": [
62
+ "DatasetDict({\n",
63
+ " train: Dataset({\n",
64
+ " features: ['speaker_id', 'path', 'audio', 'sentence'],\n",
65
+ " num_rows: 11660\n",
66
+ " })\n",
67
+ " test: Dataset({\n",
68
+ " features: ['speaker_id', 'path', 'audio', 'sentence'],\n",
69
+ " num_rows: 760\n",
70
+ " })\n",
71
+ "})"
72
+ ]
73
+ },
74
+ "execution_count": 38,
75
+ "metadata": {},
76
+ "output_type": "execute_result"
77
+ }
78
+ ],
79
+ "source": [
80
+ "vivos"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "vivos_clean = vivos.remove_columns([\"speaker_id\", \"path\"])"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 40,
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "data": {
99
+ "text/plain": [
100
+ "DatasetDict({\n",
101
+ " train: Dataset({\n",
102
+ " features: ['audio', 'sentence'],\n",
103
+ " num_rows: 11660\n",
104
+ " })\n",
105
+ " test: Dataset({\n",
106
+ " features: ['audio', 'sentence'],\n",
107
+ " num_rows: 760\n",
108
+ " })\n",
109
+ "})"
110
+ ]
111
+ },
112
+ "execution_count": 40,
113
+ "metadata": {},
114
+ "output_type": "execute_result"
115
+ }
116
+ ],
117
+ "source": [
118
+ "vivos_clean"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 79,
124
+ "metadata": {},
125
+ "outputs": [
126
+ {
127
+ "name": "stdout",
128
+ "output_type": "stream",
129
+ "text": [
130
+ "{'audio': {'path': 'vivos/train/waves/VIVOSSPK27/VIVOSSPK27_084.wav', 'array': array([ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,\n",
131
+ " 9.15527344e-05, -5.18798828e-04, -9.15527344e-04]), 'sampling_rate': 16000}, 'sentence': 'CHƯA HẾT ĐI KHIẾU NẠI THÌ NHÀ MẠNG BẢO VỀ ĐẠI LÝ CHỌN SỐ KHÁC ĐI'}\n"
132
+ ]
133
+ }
134
+ ],
135
+ "source": [
136
+ "print(vivos_clean['train'][12])"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 4,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "Found cached dataset common_voice_13_0 (/home/tesla/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/vi/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055)\n",
149
+ "Found cached dataset common_voice_13_0 (/home/tesla/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/vi/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055)\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "\n",
155
+ "common_voice = DatasetDict()\n",
156
+ "\n",
157
+ "common_voice[\"train\"] = load_dataset(\"mozilla-foundation/common_voice_13_0\", \"vi\", split=\"train+validation\", use_auth_token=True)\n",
158
+ "common_voice[\"test\"] = load_dataset(\"mozilla-foundation/common_voice_13_0\", \"vi\", split=\"test\", use_auth_token=True)\n",
159
+ "\n",
160
+ "\n"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 42,
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "data": {
170
+ "text/plain": [
171
+ "DatasetDict({\n",
172
+ " train: Dataset({\n",
173
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],\n",
174
+ " num_rows: 2854\n",
175
+ " })\n",
176
+ " test: Dataset({\n",
177
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],\n",
178
+ " num_rows: 1225\n",
179
+ " })\n",
180
+ "})"
181
+ ]
182
+ },
183
+ "execution_count": 42,
184
+ "metadata": {},
185
+ "output_type": "execute_result"
186
+ }
187
+ ],
188
+ "source": [
189
+ "common_voice"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 67,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "common_voice_clean = common_voice.remove_columns([\"client_id\", \"path\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\", \"age\", \"accent\", \"variant\"])\n"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 89,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "data": {
208
+ "text/plain": [
209
+ "DatasetDict({\n",
210
+ " train: Dataset({\n",
211
+ " features: ['audio', 'sentence'],\n",
212
+ " num_rows: 2854\n",
213
+ " })\n",
214
+ " test: Dataset({\n",
215
+ " features: ['audio', 'sentence'],\n",
216
+ " num_rows: 1225\n",
217
+ " })\n",
218
+ "})"
219
+ ]
220
+ },
221
+ "execution_count": 89,
222
+ "metadata": {},
223
+ "output_type": "execute_result"
224
+ }
225
+ ],
226
+ "source": [
227
+ "common_voice_clean"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 1,
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "ename": "NameError",
237
+ "evalue": "name 'common_voice_clean' is not defined",
238
+ "output_type": "error",
239
+ "traceback": [
240
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
241
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
242
+ "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[39mreturn\u001b[39;00m example\n\u001b[1;32m 7\u001b[0m common_voice_clear \u001b[39m=\u001b[39m DatasetDict()\n\u001b[0;32m----> 9\u001b[0m common_voice_clear[\u001b[39m\"\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m common_voice_clean[\u001b[39m\"\u001b[39m\u001b[39mtrain\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mmap(convert_to_uppercase)\n\u001b[1;32m 10\u001b[0m common_voice_clear[\u001b[39m\"\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m common_voice_clean[\u001b[39m\"\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mmap(convert_to_uppercase)\n",
243
+ "\u001b[0;31mNameError\u001b[0m: name 'common_voice_clean' is not defined"
244
+ ]
245
+ }
246
+ ],
247
+ "source": [
248
+ "from datasets import DatasetDict\n",
249
+ "\n",
250
+ "def convert_to_uppercase(example):\n",
251
+ " example[\"sentence\"] = example[\"sentence\"].upper()\n",
252
+ " return example\n",
253
+ "\n",
254
+ "common_voice_clear = DatasetDict()\n",
255
+ "\n",
256
+ "common_voice_clear[\"train\"] = common_voice_clean[\"train\"].map(convert_to_uppercase)\n",
257
+ "common_voice_clear[\"test\"] = common_voice_clean[\"test\"].map(convert_to_uppercase)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 93,
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "{'audio': {'path': '/home/tesla/.cache/huggingface/datasets/downloads/extracted/acb70896120347904e003bb826dcabc1ddd05a02210935cb44ce1c807e8742a5/vi_train_0/common_voice_vi_23901118.mp3', 'array': array([ 0.00000000e+00, 4.20543185e-14, 1.38823347e-14, ...,\n",
270
+ " -8.41874498e-06, -8.36193431e-06, -6.76584477e-06]), 'sampling_rate': 48000}, 'sentence': 'KHI CON CÓ MẸ'}\n"
271
+ ]
272
+ }
273
+ ],
274
+ "source": [
275
+ "print(common_voice_clear['train'][1])"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 94,
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "name": "stderr",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "100%|██████████| 2854/2854 [33:25<00:00, 1.42it/s]\n"
288
+ ]
289
+ }
290
+ ],
291
+ "source": [
292
+ "from pydub import AudioSegment\n",
293
+ "import os\n",
294
+ "\n",
295
+ "from tqdm import tqdm\n",
296
+ "\n",
297
+ "def convert_mp3_to_wav(mp3_path, wav_path, target_sampling_rate):\n",
298
+ " audio = AudioSegment.from_mp3(mp3_path)\n",
299
+ " audio = audio.set_frame_rate(target_sampling_rate)\n",
300
+ " audio.export(wav_path, format='wav')\n",
301
+ "\n",
302
+ "target_sampling_rate = 16000\n",
303
+ "\n",
304
+ "for example in tqdm(common_voice_clear[\"train\"]):\n",
305
+ " mp3_path = example[\"audio\"][\"path\"]\n",
306
+ " wav_path = os.path.splitext(mp3_path)[0] + \".wav\"\n",
307
+ " convert_mp3_to_wav(mp3_path, wav_path, target_sampling_rate)\n",
308
+ " example[\"audio\"][\"path\"] = wav_path\n",
309
+ "\n"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 95,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "import datasets\n",
319
+ "from datasets import Audio\n",
320
+ "\n",
321
+ "common_voice_clean = common_voice_clean.cast_column(\"audio\", Audio(sampling_rate=16000))"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 47,
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "concat = DatasetDict()"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": 96,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "concat[\"train\"] = datasets.concatenate_datasets([common_voice_clean[\"train\"], vivos_clean[\"train\"]])\n",
340
+ "\n",
341
+ "#concat['test']= datasets.concatenate_datasets([common_voice_clean[\"test\"], vivos_clean[\"test\"]])\n",
342
+ "concat['test']= vivos_clean[\"test\"]\n"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 97,
348
+ "metadata": {},
349
+ "outputs": [
350
+ {
351
+ "data": {
352
+ "text/plain": [
353
+ "DatasetDict({\n",
354
+ " train: Dataset({\n",
355
+ " features: ['audio', 'sentence'],\n",
356
+ " num_rows: 14514\n",
357
+ " })\n",
358
+ " test: Dataset({\n",
359
+ " features: ['audio', 'sentence'],\n",
360
+ " num_rows: 760\n",
361
+ " })\n",
362
+ "})"
363
+ ]
364
+ },
365
+ "execution_count": 97,
366
+ "metadata": {},
367
+ "output_type": "execute_result"
368
+ }
369
+ ],
370
+ "source": [
371
+ "concat"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 98,
377
+ "metadata": {},
378
+ "outputs": [],
379
+ "source": [
380
+ "from transformers import WhisperFeatureExtractor\n",
381
+ "\n",
382
+ "feature_extractor = WhisperFeatureExtractor.from_pretrained(\"openai/whisper-small\")\n"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 99,
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "from transformers import WhisperTokenizerFast\n",
392
+ "\n",
393
+ "tokenizer = WhisperTokenizerFast.from_pretrained(\"openai/whisper-small\", language=\"Vietnamese\", task=\"transcribe\")\n"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 80,
399
+ "metadata": {},
400
+ "outputs": [
401
+ {
402
+ "name": "stdout",
403
+ "output_type": "stream",
404
+ "text": [
405
+ "Input: KHÔNG CÓ AI BÁC BỎ QUYỀN ĐÓ\n",
406
+ "Decoded w/ special: <|startoftranscript|><|notimestamps|>KHÔNG CÓ AI BÁC BỎ QUYỀN ĐÓ<|endoftext|>\n",
407
+ "Decoded w/out special: KHÔNG CÓ AI BÁC BỎ QUYỀN ĐÓ\n",
408
+ "Are equal: True\n"
409
+ ]
410
+ }
411
+ ],
412
+ "source": [
413
+ "input_str = concat[\"train\"][8550][\"sentence\"]\n",
414
+ "labels = tokenizer(input_str).input_ids\n",
415
+ "decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)\n",
416
+ "decoded_str = tokenizer.decode(labels, skip_special_tokens=True)\n",
417
+ "\n",
418
+ "print(f\"Input: {input_str}\")\n",
419
+ "print(f\"Decoded w/ special: {decoded_with_special}\")\n",
420
+ "print(f\"Decoded w/out special: {decoded_str}\")\n",
421
+ "print(f\"Are equal: {input_str == decoded_str}\")\n"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": 100,
427
+ "metadata": {},
428
+ "outputs": [],
429
+ "source": [
430
+ "from transformers import WhisperProcessor\n",
431
+ "\n",
432
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"Vietnamese\", task=\"transcribe\")\n"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 19,
438
+ "metadata": {},
439
+ "outputs": [],
440
+ "source": [
441
+ "from datasets import Audio\n",
442
+ "\n",
443
+ "concat = concat.cast_column(\"audio\", Audio(sampling_rate=16000))"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 59,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "name": "stdout",
453
+ "output_type": "stream",
454
+ "text": [
455
+ "{'audio': {'path': 'vivos/train/waves/VIVOSSPK12/VIVOSSPK12_R077.wav', 'array': array([ 0.00000000e+00, 0.00000000e+00, -3.05175781e-05, ...,\n",
456
+ " 1.31225586e-03, 1.12915039e-03, 1.55639648e-03]), 'sampling_rate': 16000}, 'sentence': 'KIÊN GIANG'}\n"
457
+ ]
458
+ }
459
+ ],
460
+ "source": [
461
+ "print(concat[\"train\"][4500])"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 101,
467
+ "metadata": {},
468
+ "outputs": [],
469
+ "source": [
470
+ "def prepare_dataset(batch):\n",
471
+ " # load and resample audio data from 48 to 16kHz\n",
472
+ " audio = batch[\"audio\"]\n",
473
+ "\n",
474
+ " # compute log-Mel input features from input audio array \n",
475
+ " batch[\"input_features\"] = feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
476
+ "\n",
477
+ " # encode target text to label ids \n",
478
+ " batch[\"labels\"] = tokenizer(batch[\"sentence\"]).input_ids\n",
479
+ " return batch\n"
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": 102,
485
+ "metadata": {},
486
+ "outputs": [
487
+ {
488
+ "data": {
489
+ "application/vnd.jupyter.widget-view+json": {
490
+ "model_id": "c35c921e0dde433fb0ef9346310238a3",
491
+ "version_major": 2,
492
+ "version_minor": 0
493
+ },
494
+ "text/plain": [
495
+ "Map (num_proc=6): 0%| | 0/14514 [00:00<?, ? examples/s]"
496
+ ]
497
+ },
498
+ "metadata": {},
499
+ "output_type": "display_data"
500
+ },
501
+ {
502
+ "data": {
503
+ "application/vnd.jupyter.widget-view+json": {
504
+ "model_id": "8c5af4ed5f8141d2b0673972f7616941",
505
+ "version_major": 2,
506
+ "version_minor": 0
507
+ },
508
+ "text/plain": [
509
+ "Map (num_proc=6): 0%| | 0/760 [00:00<?, ? examples/s]"
510
+ ]
511
+ },
512
+ "metadata": {},
513
+ "output_type": "display_data"
514
+ }
515
+ ],
516
+ "source": [
517
+ "concat = concat.map(prepare_dataset, remove_columns=concat.column_names[\"train\"], num_proc=6)"
518
+ ]
519
+ },
520
+ {
521
+ "cell_type": "code",
522
+ "execution_count": 103,
523
+ "metadata": {},
524
+ "outputs": [],
525
+ "source": [
526
+ "import torch\n",
527
+ "\n",
528
+ "from dataclasses import dataclass\n",
529
+ "from typing import Any, Dict, List, Union\n",
530
+ "\n",
531
+ "@dataclass\n",
532
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
533
+ " processor: Any\n",
534
+ "\n",
535
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
536
+ " # split inputs and labels since they have to be of different lengths and need different padding methods\n",
537
+ " # first treat the audio inputs by simply returning torch tensors\n",
538
+ " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
539
+ " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
540
+ "\n",
541
+ " # get the tokenized label sequences\n",
542
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
543
+ " # pad the labels to max length\n",
544
+ " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
545
+ "\n",
546
+ " # replace padding with -100 to ignore loss correctly\n",
547
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
548
+ "\n",
549
+ " # if bos token is appended in previous tokenization step,\n",
550
+ " # cut bos token here as it's append later anyways\n",
551
+ " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
552
+ " labels = labels[:, 1:]\n",
553
+ "\n",
554
+ " batch[\"labels\"] = labels\n",
555
+ "\n",
556
+ " return batch\n"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 104,
562
+ "metadata": {},
563
+ "outputs": [],
564
+ "source": [
565
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "execution_count": 105,
571
+ "metadata": {},
572
+ "outputs": [],
573
+ "source": [
574
+ "import os\n",
575
+ "\n",
576
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\" "
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "markdown",
581
+ "metadata": {},
582
+ "source": [
583
+ "Train\n"
584
+ ]
585
+ },
586
+ {
587
+ "cell_type": "code",
588
+ "execution_count": 106,
589
+ "metadata": {},
590
+ "outputs": [],
591
+ "source": [
592
+ "import evaluate\n",
593
+ "\n",
594
+ "metric = evaluate.load(\"wer\")\n",
595
+ "\n",
596
+ "\n",
597
+ "def compute_metrics(pred):\n",
598
+ " pred_ids = pred.predictions\n",
599
+ " label_ids = pred.label_ids\n",
600
+ "\n",
601
+ " # replace -100 with the pad_token_id\n",
602
+ " label_ids[label_ids == -100] = tokenizer.pad_token_id\n",
603
+ "\n",
604
+ " # we do not want to group tokens when computing the metrics\n",
605
+ " pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
606
+ " label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
607
+ "\n",
608
+ " wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
609
+ "\n",
610
+ " return {\"wer\": wer}\n"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": 107,
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": [
619
+ "from transformers import WhisperForConditionalGeneration\n",
620
+ "\n",
621
+ "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")\n"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": 108,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "model.config.forced_decoder_ids = None\n",
631
+ "model.config.suppress_tokens = []"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 109,
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "from transformers import Seq2SeqTrainingArguments\n",
641
+ "\n",
642
+ "training_args = Seq2SeqTrainingArguments(\n",
643
+ " output_dir=\"./vi_whisper-small\", # change to a repo name of your choice\n",
644
+ " per_device_train_batch_size=16,\n",
645
+ " gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size\n",
646
+ " learning_rate=1e-4,\n",
647
+ " warmup_steps=1000,\n",
648
+ " max_steps=8000,\n",
649
+ " gradient_checkpointing=True,\n",
650
+ " fp16=True,\n",
651
+ " evaluation_strategy=\"steps\",\n",
652
+ " per_device_eval_batch_size=8,\n",
653
+ " predict_with_generate=True,\n",
654
+ " generation_max_length=225,\n",
655
+ " save_steps=4000,\n",
656
+ " eval_steps=1000,\n",
657
+ " logging_steps=25,\n",
658
+ " report_to=[\"tensorboard\"],\n",
659
+ " load_best_model_at_end=True,\n",
660
+ " metric_for_best_model=\"wer\",\n",
661
+ " greater_is_better=False,\n",
662
+ " push_to_hub=True,\n",
663
+ ")\n"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "code",
668
+ "execution_count": 126,
669
+ "metadata": {},
670
+ "outputs": [
671
+ {
672
+ "name": "stderr",
673
+ "output_type": "stream",
674
+ "text": [
675
+ "/media/tesla/New Volume1/DEMO/DUY/Vietnamese_ASR/./vi_whisper-small is already a clone of https://huggingface.co/DuyTa/vi_whisper-small. Make sure you pull the latest changes with `repo.git_pull()`.\n"
676
+ ]
677
+ },
678
+ {
679
+ "ename": "OSError",
680
+ "evalue": "From https://huggingface.co/DuyTa/vi_whisper-small\n d7893fc..47c00b5 main -> origin/main\nhint: You have divergent branches and need to specify how to reconcile them.\nhint: You can do so by running one of the following commands sometime before\nhint: your next pull:\nhint: \nhint: git config pull.rebase false # merge (the default strategy)\nhint: git config pull.rebase true # rebase\nhint: git config pull.ff only # fast-forward only\nhint: \nhint: You can replace \"git config\" with \"git config --global\" to set a default\nhint: preference for all repositories. You can also pass --rebase, --no-rebase,\nhint: or --ff-only on the command line to override the configured default per\nhint: invocation.\nfatal: Need to specify how to reconcile divergent branches.\n",
681
+ "output_type": "error",
682
+ "traceback": [
683
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
684
+ "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)",
685
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/repository.py:984\u001b[0m, in \u001b[0;36mRepository.git_pull\u001b[0;34m(self, rebase, lfs)\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[39mwith\u001b[39;00m _lfs_log_progress():\n\u001b[0;32m--> 984\u001b[0m result \u001b[39m=\u001b[39m run_subprocess(command, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mlocal_dir)\n\u001b[1;32m 985\u001b[0m logger\u001b[39m.\u001b[39minfo(result\u001b[39m.\u001b[39mstdout)\n",
686
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/utils/_subprocess.py:83\u001b[0m, in \u001b[0;36mrun_subprocess\u001b[0;34m(command, folder, check, **kwargs)\u001b[0m\n\u001b[1;32m 81\u001b[0m folder \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(folder)\n\u001b[0;32m---> 83\u001b[0m \u001b[39mreturn\u001b[39;00m subprocess\u001b[39m.\u001b[39;49mrun(\n\u001b[1;32m 84\u001b[0m command,\n\u001b[1;32m 85\u001b[0m stderr\u001b[39m=\u001b[39;49msubprocess\u001b[39m.\u001b[39;49mPIPE,\n\u001b[1;32m 86\u001b[0m stdout\u001b[39m=\u001b[39;49msubprocess\u001b[39m.\u001b[39;49mPIPE,\n\u001b[1;32m 87\u001b[0m check\u001b[39m=\u001b[39;49mcheck,\n\u001b[1;32m 88\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mutf-8\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 89\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mreplace\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m# if not utf-8, replace char by �\u001b[39;49;00m\n\u001b[1;32m 90\u001b[0m cwd\u001b[39m=\u001b[39;49mfolder \u001b[39mor\u001b[39;49;00m os\u001b[39m.\u001b[39;49mgetcwd(),\n\u001b[1;32m 91\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 92\u001b[0m )\n",
687
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/subprocess.py:528\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[39mif\u001b[39;00m check \u001b[39mand\u001b[39;00m retcode:\n\u001b[0;32m--> 528\u001b[0m \u001b[39mraise\u001b[39;00m CalledProcessError(retcode, process\u001b[39m.\u001b[39margs,\n\u001b[1;32m 529\u001b[0m output\u001b[39m=\u001b[39mstdout, stderr\u001b[39m=\u001b[39mstderr)\n\u001b[1;32m 530\u001b[0m \u001b[39mreturn\u001b[39;00m CompletedProcess(process\u001b[39m.\u001b[39margs, retcode, stdout, stderr)\n",
688
+ "\u001b[0;31mCalledProcessError\u001b[0m: Command '['git', 'pull']' returned non-zero exit status 128.",
689
+ "\nDuring handling of the above exception, another exception occurred:\n",
690
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
691
+ "Cell \u001b[0;32mIn[126], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtransformers\u001b[39;00m \u001b[39mimport\u001b[39;00m Seq2SeqTrainer\n\u001b[0;32m----> 3\u001b[0m trainer \u001b[39m=\u001b[39m Seq2SeqTrainer(\n\u001b[1;32m 4\u001b[0m args\u001b[39m=\u001b[39;49mtraining_args,\n\u001b[1;32m 5\u001b[0m model\u001b[39m=\u001b[39;49mmodel,\n\u001b[1;32m 6\u001b[0m train_dataset\u001b[39m=\u001b[39;49mconcat[\u001b[39m\"\u001b[39;49m\u001b[39mtrain\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 7\u001b[0m \n\u001b[1;32m 8\u001b[0m \n\u001b[1;32m 9\u001b[0m \n\u001b[1;32m 10\u001b[0m \n\u001b[1;32m 11\u001b[0m eval_dataset\u001b[39m=\u001b[39;49mconcat[\u001b[39m\"\u001b[39;49m\u001b[39mtest\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 12\u001b[0m data_collator\u001b[39m=\u001b[39;49mdata_collator,\n\u001b[1;32m 13\u001b[0m compute_metrics\u001b[39m=\u001b[39;49mcompute_metrics,\n\u001b[1;32m 14\u001b[0m tokenizer\u001b[39m=\u001b[39;49mprocessor\u001b[39m.\u001b[39;49mfeature_extractor,\n\u001b[1;32m 15\u001b[0m )\n",
692
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/trainer_seq2seq.py:56\u001b[0m, in \u001b[0;36mSeq2SeqTrainer.__init__\u001b[0;34m(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[1;32m 43\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 44\u001b[0m model: Union[\u001b[39m\"\u001b[39m\u001b[39mPreTrainedModel\u001b[39m\u001b[39m\"\u001b[39m, nn\u001b[39m.\u001b[39mModule] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 54\u001b[0m preprocess_logits_for_metrics: Optional[Callable[[torch\u001b[39m.\u001b[39mTensor, torch\u001b[39m.\u001b[39mTensor], torch\u001b[39m.\u001b[39mTensor]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 55\u001b[0m ):\n\u001b[0;32m---> 56\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__init__\u001b[39;49m(\n\u001b[1;32m 57\u001b[0m model\u001b[39m=\u001b[39;49mmodel,\n\u001b[1;32m 58\u001b[0m args\u001b[39m=\u001b[39;49margs,\n\u001b[1;32m 59\u001b[0m data_collator\u001b[39m=\u001b[39;49mdata_collator,\n\u001b[1;32m 60\u001b[0m train_dataset\u001b[39m=\u001b[39;49mtrain_dataset,\n\u001b[1;32m 61\u001b[0m eval_dataset\u001b[39m=\u001b[39;49meval_dataset,\n\u001b[1;32m 62\u001b[0m tokenizer\u001b[39m=\u001b[39;49mtokenizer,\n\u001b[1;32m 63\u001b[0m model_init\u001b[39m=\u001b[39;49mmodel_init,\n\u001b[1;32m 64\u001b[0m compute_metrics\u001b[39m=\u001b[39;49mcompute_metrics,\n\u001b[1;32m 65\u001b[0m callbacks\u001b[39m=\u001b[39;49mcallbacks,\n\u001b[1;32m 66\u001b[0m optimizers\u001b[39m=\u001b[39;49moptimizers,\n\u001b[1;32m 67\u001b[0m preprocess_logits_for_metrics\u001b[39m=\u001b[39;49mpreprocess_logits_for_metrics,\n\u001b[1;32m 68\u001b[0m )\n\u001b[1;32m 70\u001b[0m \u001b[39m# Override self.model.generation_config if a GenerationConfig is specified in args.\u001b[39;00m\n\u001b[1;32m 71\u001b[0m \u001b[39m# Priority: args.generation_config > model.generation_config > default GenerationConfig.\u001b[39;00m\n\u001b[1;32m 72\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mgeneration_config \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
693
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/trainer.py:551\u001b[0m, in \u001b[0;36mTrainer.__init__\u001b[0;34m(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[39m# Create clone of distant repo and output directory if needed\u001b[39;00m\n\u001b[1;32m 550\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mpush_to_hub:\n\u001b[0;32m--> 551\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49minit_git_repo(at_init\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 552\u001b[0m \u001b[39m# In case of pull, we need to make sure every process has the latest.\u001b[39;00m\n\u001b[1;32m 553\u001b[0m \u001b[39mif\u001b[39;00m is_torch_tpu_available():\n",
694
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/trainer.py:3449\u001b[0m, in \u001b[0;36mTrainer.init_git_repo\u001b[0;34m(self, at_init)\u001b[0m\n\u001b[1;32m 3446\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 3447\u001b[0m \u001b[39mraise\u001b[39;00m\n\u001b[0;32m-> 3449\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrepo\u001b[39m.\u001b[39;49mgit_pull()\n\u001b[1;32m 3451\u001b[0m \u001b[39m# By default, ignore the checkpoint folders\u001b[39;00m\n\u001b[1;32m 3452\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 3453\u001b[0m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mexists(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39moutput_dir, \u001b[39m\"\u001b[39m\u001b[39m.gitignore\u001b[39m\u001b[39m\"\u001b[39m))\n\u001b[1;32m 3454\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mhub_strategy \u001b[39m!=\u001b[39m HubStrategy\u001b[39m.\u001b[39mALL_CHECKPOINTS\n\u001b[1;32m 3455\u001b[0m ):\n",
695
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/repository.py:987\u001b[0m, in \u001b[0;36mRepository.git_pull\u001b[0;34m(self, rebase, lfs)\u001b[0m\n\u001b[1;32m 985\u001b[0m logger\u001b[39m.\u001b[39minfo(result\u001b[39m.\u001b[39mstdout)\n\u001b[1;32m 986\u001b[0m \u001b[39mexcept\u001b[39;00m subprocess\u001b[39m.\u001b[39mCalledProcessError \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m--> 987\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mEnvironmentError\u001b[39;00m(exc\u001b[39m.\u001b[39mstderr)\n",
696
+ "\u001b[0;31mOSError\u001b[0m: From https://huggingface.co/DuyTa/vi_whisper-small\n d7893fc..47c00b5 main -> origin/main\nhint: You have divergent branches and need to specify how to reconcile them.\nhint: You can do so by running one of the following commands sometime before\nhint: your next pull:\nhint: \nhint: git config pull.rebase false # merge (the default strategy)\nhint: git config pull.rebase true # rebase\nhint: git config pull.ff only # fast-forward only\nhint: \nhint: You can replace \"git config\" with \"git config --global\" to set a default\nhint: preference for all repositories. You can also pass --rebase, --no-rebase,\nhint: or --ff-only on the command line to override the configured default per\nhint: invocation.\nfatal: Need to specify how to reconcile divergent branches.\n"
697
+ ]
698
+ }
699
+ ],
700
+ "source": [
701
+ "from transformers import Seq2SeqTrainer\n",
702
+ "\n",
703
+ "trainer = Seq2SeqTrainer(\n",
704
+ " args=training_args,\n",
705
+ " model=model,\n",
706
+ " train_dataset=concat[\"train\"],\n",
707
+ "\n",
708
+ " eval_dataset=concat[\"test\"],\n",
709
+ " data_collator=data_collator,\n",
710
+ " compute_metrics=compute_metrics,\n",
711
+ " tokenizer=processor.feature_extractor,\n",
712
+ ")\n"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "execution_count": 130,
718
+ "metadata": {},
719
+ "outputs": [
720
+ {
721
+ "data": {
722
+ "text/plain": [
723
+ "('./vi_whisper-small/tokenizer_config.json',\n",
724
+ " './vi_whisper-small/special_tokens_map.json',\n",
725
+ " './vi_whisper-small/vocab.json',\n",
726
+ " './vi_whisper-small/merges.txt',\n",
727
+ " './vi_whisper-small/normalizer.json',\n",
728
+ " './vi_whisper-small/added_tokens.json',\n",
729
+ " './vi_whisper-small/tokenizer.json')"
730
+ ]
731
+ },
732
+ "execution_count": 130,
733
+ "metadata": {},
734
+ "output_type": "execute_result"
735
+ }
736
+ ],
737
+ "source": [
738
+ "tokenizer.save_pretrained(\"./vi_whisper-small/\")"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 31,
744
+ "metadata": {},
745
+ "outputs": [
746
+ {
747
+ "name": "stdout",
748
+ "output_type": "stream",
749
+ "text": [
750
+ "Device 0:\n",
751
+ " Currently allocated memory: 922.884765625 MB\n",
752
+ " Peak memory usage: 922.884765625 MB\n"
753
+ ]
754
+ }
755
+ ],
756
+ "source": [
757
+ "import torch\n",
758
+ "\n",
759
+ "device_count = torch.cuda.device_count()\n",
760
+ "\n",
761
+ "for device in range(device_count):\n",
762
+ " torch.cuda.device(device)\n",
763
+ " allocated_memory = torch.cuda.memory_allocated(device)\n",
764
+ " peak_memory = torch.cuda.max_memory_allocated(device)\n",
765
+ " print(f\"Device {device}:\")\n",
766
+ " print(f\" Currently allocated memory: {allocated_memory / 1024**2} MB\")\n",
767
+ " print(f\" Peak memory usage: {peak_memory / 1024**2} MB\")\n",
768
+ "\n"
769
+ ]
770
+ },
771
+ {
772
+ "cell_type": "code",
773
+ "execution_count": 32,
774
+ "metadata": {},
775
+ "outputs": [
776
+ {
777
+ "name": "stdout",
778
+ "output_type": "stream",
779
+ "text": [
780
+ "Device 0:\n",
781
+ " Name: Tesla T4\n",
782
+ " Max Memory: 14966.375 MB\n"
783
+ ]
784
+ }
785
+ ],
786
+ "source": [
787
+ "device_count = torch.cuda.device_count()\n",
788
+ "\n",
789
+ "for device in range(device_count):\n",
790
+ " properties = torch.cuda.get_device_properties(device)\n",
791
+ " print(f\"Device {device}:\")\n",
792
+ " print(f\" Name: {properties.name}\")\n",
793
+ " print(f\" Max Memory: {properties.total_memory / 1024**2} MB\")\n"
794
+ ]
795
+ },
796
+ {
797
+ "cell_type": "code",
798
+ "execution_count": 111,
799
+ "metadata": {},
800
+ "outputs": [
801
+ {
802
+ "name": "stderr",
803
+ "output_type": "stream",
804
+ "text": [
805
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
806
+ " warnings.warn(\n"
807
+ ]
808
+ },
809
+ {
810
+ "data": {
811
+ "application/vnd.jupyter.widget-view+json": {
812
+ "model_id": "da5a536a979e4d34bc59eaede9204d06",
813
+ "version_major": 2,
814
+ "version_minor": 0
815
+ },
816
+ "text/plain": [
817
+ " 0%| | 0/8000 [00:00<?, ?it/s]"
818
+ ]
819
+ },
820
+ "metadata": {},
821
+ "output_type": "display_data"
822
+ },
823
+ {
824
+ "name": "stdout",
825
+ "output_type": "stream",
826
+ "text": [
827
+ "{'loss': 3.8537, 'learning_rate': 2.1000000000000002e-06, 'epoch': 0.03}\n",
828
+ "{'loss': 2.2347, 'learning_rate': 4.6e-06, 'epoch': 0.06}\n",
829
+ "{'loss': 1.2627, 'learning_rate': 7.1e-06, 'epoch': 0.08}\n",
830
+ "{'loss': 0.8976, 'learning_rate': 9.600000000000001e-06, 'epoch': 0.11}\n",
831
+ "{'loss': 0.7313, 'learning_rate': 1.2100000000000001e-05, 'epoch': 0.14}\n",
832
+ "{'loss': 0.6526, 'learning_rate': 1.4599999999999999e-05, 'epoch': 0.17}\n",
833
+ "{'loss': 0.7221, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.19}\n",
834
+ "{'loss': 0.6478, 'learning_rate': 1.9500000000000003e-05, 'epoch': 0.22}\n",
835
+ "{'loss': 1.7029, 'learning_rate': 2.19e-05, 'epoch': 0.25}\n",
836
+ "{'loss': 1.1476, 'learning_rate': 2.44e-05, 'epoch': 0.28}\n",
837
+ "{'loss': 0.5837, 'learning_rate': 2.6900000000000003e-05, 'epoch': 0.3}\n",
838
+ "{'loss': 0.5912, 'learning_rate': 2.94e-05, 'epoch': 0.33}\n",
839
+ "{'loss': 0.6872, 'learning_rate': 3.19e-05, 'epoch': 0.36}\n",
840
+ "{'loss': 0.4103, 'learning_rate': 3.4399999999999996e-05, 'epoch': 0.39}\n",
841
+ "{'loss': 0.4293, 'learning_rate': 3.69e-05, 'epoch': 0.41}\n",
842
+ "{'loss': 0.3055, 'learning_rate': 3.94e-05, 'epoch': 0.44}\n",
843
+ "{'loss': 0.311, 'learning_rate': 4.19e-05, 'epoch': 0.47}\n",
844
+ "{'loss': 0.3212, 'learning_rate': 4.44e-05, 'epoch': 0.5}\n",
845
+ "{'loss': 0.2917, 'learning_rate': 4.69e-05, 'epoch': 0.52}\n",
846
+ "{'loss': 0.2975, 'learning_rate': 4.94e-05, 'epoch': 0.55}\n",
847
+ "{'loss': 0.3254, 'learning_rate': 5.19e-05, 'epoch': 0.58}\n",
848
+ "{'loss': 0.2825, 'learning_rate': 5.440000000000001e-05, 'epoch': 0.61}\n",
849
+ "{'loss': 0.2929, 'learning_rate': 5.69e-05, 'epoch': 0.63}\n",
850
+ "{'loss': 0.3056, 'learning_rate': 5.94e-05, 'epoch': 0.66}\n",
851
+ "{'loss': 0.3105, 'learning_rate': 6.19e-05, 'epoch': 0.69}\n",
852
+ "{'loss': 0.3702, 'learning_rate': 6.440000000000001e-05, 'epoch': 0.72}\n",
853
+ "{'loss': 0.2684, 'learning_rate': 6.690000000000001e-05, 'epoch': 0.74}\n",
854
+ "{'loss': 0.2767, 'learning_rate': 6.939999999999999e-05, 'epoch': 0.77}\n",
855
+ "{'loss': 0.315, 'learning_rate': 7.19e-05, 'epoch': 0.8}\n",
856
+ "{'loss': 0.3132, 'learning_rate': 7.44e-05, 'epoch': 0.83}\n",
857
+ "{'loss': 0.3933, 'learning_rate': 7.69e-05, 'epoch': 0.85}\n",
858
+ "{'loss': 0.311, 'learning_rate': 7.94e-05, 'epoch': 0.88}\n",
859
+ "{'loss': 0.3104, 'learning_rate': 8.19e-05, 'epoch': 0.91}\n",
860
+ "{'loss': 0.297, 'learning_rate': 8.44e-05, 'epoch': 0.94}\n",
861
+ "{'loss': 0.3094, 'learning_rate': 8.69e-05, 'epoch': 0.96}\n",
862
+ "{'loss': 0.29, 'learning_rate': 8.94e-05, 'epoch': 0.99}\n",
863
+ "{'loss': 0.2712, 'learning_rate': 9.190000000000001e-05, 'epoch': 1.02}\n",
864
+ "{'loss': 0.262, 'learning_rate': 9.44e-05, 'epoch': 1.05}\n",
865
+ "{'loss': 0.2481, 'learning_rate': 9.69e-05, 'epoch': 1.07}\n",
866
+ "{'loss': 0.249, 'learning_rate': 9.94e-05, 'epoch': 1.1}\n"
867
+ ]
868
+ },
869
+ {
870
+ "data": {
871
+ "application/vnd.jupyter.widget-view+json": {
872
+ "model_id": "ea1e7cf193cc4b5dacd0883200ff6ef6",
873
+ "version_major": 2,
874
+ "version_minor": 0
875
+ },
876
+ "text/plain": [
877
+ " 0%| | 0/95 [00:00<?, ?it/s]"
878
+ ]
879
+ },
880
+ "metadata": {},
881
+ "output_type": "display_data"
882
+ },
883
+ {
884
+ "name": "stdout",
885
+ "output_type": "stream",
886
+ "text": [
887
+ "{'eval_loss': 0.3765707015991211, 'eval_wer': 32.16783216783217, 'eval_runtime': 349.1035, 'eval_samples_per_second': 2.177, 'eval_steps_per_second': 0.272, 'epoch': 1.1}\n",
888
+ "{'loss': 0.2729, 'learning_rate': 9.972857142857144e-05, 'epoch': 1.13}\n",
889
+ "{'loss': 0.267, 'learning_rate': 9.937142857142857e-05, 'epoch': 1.16}\n",
890
+ "{'loss': 0.2617, 'learning_rate': 9.901428571428571e-05, 'epoch': 1.18}\n",
891
+ "{'loss': 0.2613, 'learning_rate': 9.865714285714286e-05, 'epoch': 1.21}\n",
892
+ "{'loss': 0.2736, 'learning_rate': 9.83e-05, 'epoch': 1.24}\n",
893
+ "{'loss': 0.245, 'learning_rate': 9.794285714285714e-05, 'epoch': 1.27}\n",
894
+ "{'loss': 0.2385, 'learning_rate': 9.75857142857143e-05, 'epoch': 1.29}\n",
895
+ "{'loss': 0.258, 'learning_rate': 9.722857142857144e-05, 'epoch': 1.32}\n",
896
+ "{'loss': 0.2623, 'learning_rate': 9.687142857142858e-05, 'epoch': 1.35}\n",
897
+ "{'loss': 0.2346, 'learning_rate': 9.651428571428572e-05, 'epoch': 1.38}\n",
898
+ "{'loss': 0.2376, 'learning_rate': 9.615714285714286e-05, 'epoch': 1.4}\n",
899
+ "{'loss': 0.246, 'learning_rate': 9.58e-05, 'epoch': 1.43}\n",
900
+ "{'loss': 0.2201, 'learning_rate': 9.544285714285715e-05, 'epoch': 1.46}\n",
901
+ "{'loss': 0.2233, 'learning_rate': 9.508571428571429e-05, 'epoch': 1.49}\n",
902
+ "{'loss': 0.2154, 'learning_rate': 9.472857142857143e-05, 'epoch': 1.51}\n",
903
+ "{'loss': 0.2348, 'learning_rate': 9.437142857142857e-05, 'epoch': 1.54}\n",
904
+ "{'loss': 0.2159, 'learning_rate': 9.401428571428572e-05, 'epoch': 1.57}\n",
905
+ "{'loss': 0.2265, 'learning_rate': 9.365714285714286e-05, 'epoch': 1.6}\n",
906
+ "{'loss': 0.2118, 'learning_rate': 9.33e-05, 'epoch': 1.62}\n",
907
+ "{'loss': 0.2223, 'learning_rate': 9.294285714285714e-05, 'epoch': 1.65}\n",
908
+ "{'loss': 0.2, 'learning_rate': 9.258571428571428e-05, 'epoch': 1.68}\n",
909
+ "{'loss': 0.206, 'learning_rate': 9.222857142857142e-05, 'epoch': 1.71}\n",
910
+ "{'loss': 0.1979, 'learning_rate': 9.187142857142858e-05, 'epoch': 1.73}\n",
911
+ "{'loss': 0.2022, 'learning_rate': 9.151428571428572e-05, 'epoch': 1.76}\n",
912
+ "{'loss': 0.2028, 'learning_rate': 9.115714285714286e-05, 'epoch': 1.79}\n",
913
+ "{'loss': 0.2161, 'learning_rate': 9.080000000000001e-05, 'epoch': 1.82}\n",
914
+ "{'loss': 0.1964, 'learning_rate': 9.044285714285715e-05, 'epoch': 1.84}\n",
915
+ "{'loss': 0.2151, 'learning_rate': 9.008571428571429e-05, 'epoch': 1.87}\n",
916
+ "{'loss': 0.2056, 'learning_rate': 8.972857142857143e-05, 'epoch': 1.9}\n",
917
+ "{'loss': 0.189, 'learning_rate': 8.937142857142857e-05, 'epoch': 1.93}\n",
918
+ "{'loss': 0.1944, 'learning_rate': 8.901428571428571e-05, 'epoch': 1.95}\n",
919
+ "{'loss': 0.1834, 'learning_rate': 8.865714285714287e-05, 'epoch': 1.98}\n",
920
+ "{'loss': 0.1557, 'learning_rate': 8.83e-05, 'epoch': 2.01}\n",
921
+ "{'loss': 0.1337, 'learning_rate': 8.794285714285714e-05, 'epoch': 2.04}\n",
922
+ "{'loss': 0.1338, 'learning_rate': 8.75857142857143e-05, 'epoch': 2.06}\n",
923
+ "{'loss': 0.1338, 'learning_rate': 8.722857142857144e-05, 'epoch': 2.09}\n",
924
+ "{'loss': 0.1385, 'learning_rate': 8.687142857142856e-05, 'epoch': 2.12}\n",
925
+ "{'loss': 0.1259, 'learning_rate': 8.651428571428572e-05, 'epoch': 2.15}\n",
926
+ "{'loss': 0.1268, 'learning_rate': 8.615714285714286e-05, 'epoch': 2.18}\n",
927
+ "{'loss': 0.1416, 'learning_rate': 8.58e-05, 'epoch': 2.2}\n"
928
+ ]
929
+ },
930
+ {
931
+ "data": {
932
+ "application/vnd.jupyter.widget-view+json": {
933
+ "model_id": "2b48a513542d43558d8fef6a8ef00629",
934
+ "version_major": 2,
935
+ "version_minor": 0
936
+ },
937
+ "text/plain": [
938
+ " 0%| | 0/95 [00:00<?, ?it/s]"
939
+ ]
940
+ },
941
+ "metadata": {},
942
+ "output_type": "display_data"
943
+ },
944
+ {
945
+ "name": "stdout",
946
+ "output_type": "stream",
947
+ "text": [
948
+ "{'eval_loss': 0.2880653738975525, 'eval_wer': 46.464646464646464, 'eval_runtime': 337.0754, 'eval_samples_per_second': 2.255, 'eval_steps_per_second': 0.282, 'epoch': 2.2}\n",
949
+ "{'loss': 0.1271, 'learning_rate': 8.544285714285715e-05, 'epoch': 2.23}\n",
950
+ "{'loss': 0.1345, 'learning_rate': 8.508571428571429e-05, 'epoch': 2.26}\n",
951
+ "{'loss': 0.149, 'learning_rate': 8.472857142857143e-05, 'epoch': 2.29}\n",
952
+ "{'loss': 0.1289, 'learning_rate': 8.437142857142859e-05, 'epoch': 2.31}\n",
953
+ "{'loss': 0.1391, 'learning_rate': 8.401428571428573e-05, 'epoch': 2.34}\n",
954
+ "{'loss': 0.1532, 'learning_rate': 8.365714285714285e-05, 'epoch': 2.37}\n",
955
+ "{'loss': 0.1283, 'learning_rate': 8.33e-05, 'epoch': 2.4}\n",
956
+ "{'loss': 0.1336, 'learning_rate': 8.294285714285715e-05, 'epoch': 2.42}\n",
957
+ "{'loss': 0.129, 'learning_rate': 8.258571428571429e-05, 'epoch': 2.45}\n",
958
+ "{'loss': 0.1399, 'learning_rate': 8.222857142857144e-05, 'epoch': 2.48}\n",
959
+ "{'loss': 0.1411, 'learning_rate': 8.187142857142858e-05, 'epoch': 2.51}\n",
960
+ "{'loss': 0.1298, 'learning_rate': 8.151428571428572e-05, 'epoch': 2.53}\n",
961
+ "{'loss': 0.1397, 'learning_rate': 8.115714285714286e-05, 'epoch': 2.56}\n",
962
+ "{'loss': 0.1356, 'learning_rate': 8.080000000000001e-05, 'epoch': 2.59}\n",
963
+ "{'loss': 0.1366, 'learning_rate': 8.044285714285714e-05, 'epoch': 2.62}\n",
964
+ "{'loss': 0.1331, 'learning_rate': 8.008571428571429e-05, 'epoch': 2.64}\n",
965
+ "{'loss': 0.1297, 'learning_rate': 7.972857142857143e-05, 'epoch': 2.67}\n",
966
+ "{'loss': 0.1414, 'learning_rate': 7.937142857142857e-05, 'epoch': 2.7}\n",
967
+ "{'loss': 0.1189, 'learning_rate': 7.901428571428571e-05, 'epoch': 2.73}\n",
968
+ "{'loss': 0.1416, 'learning_rate': 7.865714285714287e-05, 'epoch': 2.75}\n",
969
+ "{'loss': 0.1378, 'learning_rate': 7.83e-05, 'epoch': 2.78}\n",
970
+ "{'loss': 0.1305, 'learning_rate': 7.794285714285715e-05, 'epoch': 2.81}\n",
971
+ "{'loss': 0.1571, 'learning_rate': 7.75857142857143e-05, 'epoch': 2.84}\n",
972
+ "{'loss': 0.1285, 'learning_rate': 7.722857142857143e-05, 'epoch': 2.86}\n",
973
+ "{'loss': 0.1339, 'learning_rate': 7.687142857142857e-05, 'epoch': 2.89}\n",
974
+ "{'loss': 0.1216, 'learning_rate': 7.651428571428572e-05, 'epoch': 2.92}\n",
975
+ "{'loss': 0.1321, 'learning_rate': 7.615714285714286e-05, 'epoch': 2.95}\n",
976
+ "{'loss': 0.1259, 'learning_rate': 7.58e-05, 'epoch': 2.97}\n",
977
+ "{'loss': 0.1259, 'learning_rate': 7.544285714285715e-05, 'epoch': 3.0}\n",
978
+ "{'loss': 0.0851, 'learning_rate': 7.508571428571429e-05, 'epoch': 3.03}\n",
979
+ "{'loss': 0.0764, 'learning_rate': 7.472857142857143e-05, 'epoch': 3.06}\n",
980
+ "{'loss': 0.0986, 'learning_rate': 7.438571428571429e-05, 'epoch': 3.08}\n",
981
+ "{'loss': 0.0883, 'learning_rate': 7.402857142857143e-05, 'epoch': 3.11}\n",
982
+ "{'loss': 0.0811, 'learning_rate': 7.367142857142858e-05, 'epoch': 3.14}\n",
983
+ "{'loss': 0.0872, 'learning_rate': 7.331428571428571e-05, 'epoch': 3.17}\n",
984
+ "{'loss': 0.0872, 'learning_rate': 7.295714285714286e-05, 'epoch': 3.19}\n",
985
+ "{'loss': 0.0805, 'learning_rate': 7.26e-05, 'epoch': 3.22}\n",
986
+ "{'loss': 0.0803, 'learning_rate': 7.224285714285714e-05, 'epoch': 3.25}\n",
987
+ "{'loss': 0.0753, 'learning_rate': 7.188571428571428e-05, 'epoch': 3.28}\n",
988
+ "{'loss': 0.0839, 'learning_rate': 7.152857142857144e-05, 'epoch': 3.3}\n"
989
+ ]
990
+ },
991
+ {
992
+ "data": {
993
+ "application/vnd.jupyter.widget-view+json": {
994
+ "model_id": "ce95b9d2a270464fbeab22454f214a1d",
995
+ "version_major": 2,
996
+ "version_minor": 0
997
+ },
998
+ "text/plain": [
999
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1000
+ ]
1001
+ },
1002
+ "metadata": {},
1003
+ "output_type": "display_data"
1004
+ },
1005
+ {
1006
+ "name": "stdout",
1007
+ "output_type": "stream",
1008
+ "text": [
1009
+ "{'eval_loss': 0.279912531375885, 'eval_wer': 22.779072779072777, 'eval_runtime': 345.4945, 'eval_samples_per_second': 2.2, 'eval_steps_per_second': 0.275, 'epoch': 3.3}\n",
1010
+ "{'loss': 0.0885, 'learning_rate': 7.117142857142858e-05, 'epoch': 3.33}\n",
1011
+ "{'loss': 0.0845, 'learning_rate': 7.081428571428572e-05, 'epoch': 3.36}\n",
1012
+ "{'loss': 0.0761, 'learning_rate': 7.045714285714287e-05, 'epoch': 3.39}\n",
1013
+ "{'loss': 0.0756, 'learning_rate': 7.01e-05, 'epoch': 3.41}\n",
1014
+ "{'loss': 0.0859, 'learning_rate': 6.974285714285715e-05, 'epoch': 3.44}\n",
1015
+ "{'loss': 0.0972, 'learning_rate': 6.938571428571429e-05, 'epoch': 3.47}\n",
1016
+ "{'loss': 0.0822, 'learning_rate': 6.902857142857143e-05, 'epoch': 3.5}\n",
1017
+ "{'loss': 0.0892, 'learning_rate': 6.867142857142857e-05, 'epoch': 3.52}\n",
1018
+ "{'loss': 0.0735, 'learning_rate': 6.831428571428572e-05, 'epoch': 3.55}\n",
1019
+ "{'loss': 0.0893, 'learning_rate': 6.795714285714286e-05, 'epoch': 3.58}\n",
1020
+ "{'loss': 0.0869, 'learning_rate': 6.76e-05, 'epoch': 3.61}\n",
1021
+ "{'loss': 0.0877, 'learning_rate': 6.724285714285714e-05, 'epoch': 3.63}\n",
1022
+ "{'loss': 0.07, 'learning_rate': 6.688571428571428e-05, 'epoch': 3.66}\n",
1023
+ "{'loss': 0.0807, 'learning_rate': 6.652857142857142e-05, 'epoch': 3.69}\n",
1024
+ "{'loss': 0.0831, 'learning_rate': 6.617142857142858e-05, 'epoch': 3.72}\n",
1025
+ "{'loss': 0.0836, 'learning_rate': 6.581428571428572e-05, 'epoch': 3.74}\n",
1026
+ "{'loss': 0.0875, 'learning_rate': 6.545714285714286e-05, 'epoch': 3.77}\n",
1027
+ "{'loss': 0.0846, 'learning_rate': 6.510000000000001e-05, 'epoch': 3.8}\n",
1028
+ "{'loss': 0.0779, 'learning_rate': 6.474285714285715e-05, 'epoch': 3.83}\n",
1029
+ "{'loss': 0.0871, 'learning_rate': 6.438571428571429e-05, 'epoch': 3.85}\n",
1030
+ "{'loss': 0.0777, 'learning_rate': 6.402857142857143e-05, 'epoch': 3.88}\n",
1031
+ "{'loss': 0.0856, 'learning_rate': 6.367142857142857e-05, 'epoch': 3.91}\n",
1032
+ "{'loss': 0.083, 'learning_rate': 6.331428571428571e-05, 'epoch': 3.94}\n",
1033
+ "{'loss': 0.0667, 'learning_rate': 6.295714285714286e-05, 'epoch': 3.96}\n",
1034
+ "{'loss': 0.083, 'learning_rate': 6.26e-05, 'epoch': 3.99}\n",
1035
+ "{'loss': 0.0505, 'learning_rate': 6.224285714285714e-05, 'epoch': 4.02}\n",
1036
+ "{'loss': 0.0426, 'learning_rate': 6.18857142857143e-05, 'epoch': 4.05}\n",
1037
+ "{'loss': 0.0453, 'learning_rate': 6.152857142857144e-05, 'epoch': 4.07}\n",
1038
+ "{'loss': 0.0482, 'learning_rate': 6.117142857142858e-05, 'epoch': 4.1}\n",
1039
+ "{'loss': 0.0511, 'learning_rate': 6.081428571428571e-05, 'epoch': 4.13}\n",
1040
+ "{'loss': 0.0583, 'learning_rate': 6.045714285714286e-05, 'epoch': 4.16}\n",
1041
+ "{'loss': 0.0466, 'learning_rate': 6.0100000000000004e-05, 'epoch': 4.19}\n",
1042
+ "{'loss': 0.0502, 'learning_rate': 5.9742857142857144e-05, 'epoch': 4.21}\n",
1043
+ "{'loss': 0.0414, 'learning_rate': 5.938571428571429e-05, 'epoch': 4.24}\n",
1044
+ "{'loss': 0.0501, 'learning_rate': 5.902857142857143e-05, 'epoch': 4.27}\n",
1045
+ "{'loss': 0.0478, 'learning_rate': 5.867142857142858e-05, 'epoch': 4.3}\n",
1046
+ "{'loss': 0.0482, 'learning_rate': 5.8314285714285724e-05, 'epoch': 4.32}\n",
1047
+ "{'loss': 0.0463, 'learning_rate': 5.7957142857142864e-05, 'epoch': 4.35}\n",
1048
+ "{'loss': 0.0513, 'learning_rate': 5.76e-05, 'epoch': 4.38}\n",
1049
+ "{'loss': 0.0546, 'learning_rate': 5.7242857142857144e-05, 'epoch': 4.41}\n"
1050
+ ]
1051
+ },
1052
+ {
1053
+ "data": {
1054
+ "application/vnd.jupyter.widget-view+json": {
1055
+ "model_id": "5b1ec44b408a4de8aa8a3e9702dae453",
1056
+ "version_major": 2,
1057
+ "version_minor": 0
1058
+ },
1059
+ "text/plain": [
1060
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1061
+ ]
1062
+ },
1063
+ "metadata": {},
1064
+ "output_type": "display_data"
1065
+ },
1066
+ {
1067
+ "name": "stdout",
1068
+ "output_type": "stream",
1069
+ "text": [
1070
+ "{'eval_loss': 0.28944167494773865, 'eval_wer': 21.885521885521886, 'eval_runtime': 344.5818, 'eval_samples_per_second': 2.206, 'eval_steps_per_second': 0.276, 'epoch': 4.41}\n",
1071
+ "{'loss': 0.0515, 'learning_rate': 5.6885714285714284e-05, 'epoch': 4.43}\n",
1072
+ "{'loss': 0.0394, 'learning_rate': 5.652857142857143e-05, 'epoch': 4.46}\n",
1073
+ "{'loss': 0.0562, 'learning_rate': 5.617142857142858e-05, 'epoch': 4.49}\n",
1074
+ "{'loss': 0.0532, 'learning_rate': 5.581428571428572e-05, 'epoch': 4.52}\n",
1075
+ "{'loss': 0.0525, 'learning_rate': 5.5457142857142864e-05, 'epoch': 4.54}\n",
1076
+ "{'loss': 0.0553, 'learning_rate': 5.5100000000000004e-05, 'epoch': 4.57}\n",
1077
+ "{'loss': 0.0464, 'learning_rate': 5.474285714285714e-05, 'epoch': 4.6}\n",
1078
+ "{'loss': 0.0425, 'learning_rate': 5.4385714285714284e-05, 'epoch': 4.63}\n",
1079
+ "{'loss': 0.0529, 'learning_rate': 5.402857142857143e-05, 'epoch': 4.65}\n",
1080
+ "{'loss': 0.0534, 'learning_rate': 5.367142857142857e-05, 'epoch': 4.68}\n",
1081
+ "{'loss': 0.0505, 'learning_rate': 5.331428571428572e-05, 'epoch': 4.71}\n",
1082
+ "{'loss': 0.0416, 'learning_rate': 5.295714285714286e-05, 'epoch': 4.74}\n",
1083
+ "{'loss': 0.0438, 'learning_rate': 5.2600000000000005e-05, 'epoch': 4.76}\n",
1084
+ "{'loss': 0.0568, 'learning_rate': 5.224285714285715e-05, 'epoch': 4.79}\n",
1085
+ "{'loss': 0.0519, 'learning_rate': 5.188571428571429e-05, 'epoch': 4.82}\n",
1086
+ "{'loss': 0.0415, 'learning_rate': 5.1528571428571425e-05, 'epoch': 4.85}\n",
1087
+ "{'loss': 0.0502, 'learning_rate': 5.117142857142857e-05, 'epoch': 4.87}\n",
1088
+ "{'loss': 0.0433, 'learning_rate': 5.081428571428571e-05, 'epoch': 4.9}\n",
1089
+ "{'loss': 0.0527, 'learning_rate': 5.045714285714286e-05, 'epoch': 4.93}\n",
1090
+ "{'loss': 0.0434, 'learning_rate': 5.0100000000000005e-05, 'epoch': 4.96}\n",
1091
+ "{'loss': 0.0485, 'learning_rate': 4.9742857142857145e-05, 'epoch': 4.98}\n",
1092
+ "{'loss': 0.0358, 'learning_rate': 4.938571428571429e-05, 'epoch': 5.01}\n",
1093
+ "{'loss': 0.0218, 'learning_rate': 4.902857142857143e-05, 'epoch': 5.04}\n",
1094
+ "{'loss': 0.0245, 'learning_rate': 4.867142857142857e-05, 'epoch': 5.07}\n",
1095
+ "{'loss': 0.0272, 'learning_rate': 4.831428571428572e-05, 'epoch': 5.09}\n",
1096
+ "{'loss': 0.0258, 'learning_rate': 4.795714285714286e-05, 'epoch': 5.12}\n",
1097
+ "{'loss': 0.0228, 'learning_rate': 4.76e-05, 'epoch': 5.15}\n",
1098
+ "{'loss': 0.0275, 'learning_rate': 4.7242857142857145e-05, 'epoch': 5.18}\n",
1099
+ "{'loss': 0.0269, 'learning_rate': 4.6885714285714285e-05, 'epoch': 5.2}\n",
1100
+ "{'loss': 0.0237, 'learning_rate': 4.652857142857143e-05, 'epoch': 5.23}\n",
1101
+ "{'loss': 0.0288, 'learning_rate': 4.617142857142857e-05, 'epoch': 5.26}\n",
1102
+ "{'loss': 0.0269, 'learning_rate': 4.581428571428572e-05, 'epoch': 5.29}\n",
1103
+ "{'loss': 0.0276, 'learning_rate': 4.545714285714286e-05, 'epoch': 5.31}\n",
1104
+ "{'loss': 0.0276, 'learning_rate': 4.5100000000000005e-05, 'epoch': 5.34}\n",
1105
+ "{'loss': 0.0242, 'learning_rate': 4.4742857142857145e-05, 'epoch': 5.37}\n",
1106
+ "{'loss': 0.0238, 'learning_rate': 4.4385714285714285e-05, 'epoch': 5.4}\n",
1107
+ "{'loss': 0.0302, 'learning_rate': 4.402857142857143e-05, 'epoch': 5.42}\n",
1108
+ "{'loss': 0.0253, 'learning_rate': 4.367142857142857e-05, 'epoch': 5.45}\n",
1109
+ "{'loss': 0.0256, 'learning_rate': 4.331428571428572e-05, 'epoch': 5.48}\n",
1110
+ "{'loss': 0.0256, 'learning_rate': 4.295714285714286e-05, 'epoch': 5.51}\n"
1111
+ ]
1112
+ },
1113
+ {
1114
+ "data": {
1115
+ "application/vnd.jupyter.widget-view+json": {
1116
+ "model_id": "2a3e7ace41cd44768ebc093aa571360e",
1117
+ "version_major": 2,
1118
+ "version_minor": 0
1119
+ },
1120
+ "text/plain": [
1121
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1122
+ ]
1123
+ },
1124
+ "metadata": {},
1125
+ "output_type": "display_data"
1126
+ },
1127
+ {
1128
+ "name": "stdout",
1129
+ "output_type": "stream",
1130
+ "text": [
1131
+ "{'eval_loss': 0.3023395836353302, 'eval_wer': 32.2973322973323, 'eval_runtime': 361.3589, 'eval_samples_per_second': 2.103, 'eval_steps_per_second': 0.263, 'epoch': 5.51}\n",
1132
+ "{'loss': 0.0215, 'learning_rate': 4.26e-05, 'epoch': 5.53}\n",
1133
+ "{'loss': 0.0272, 'learning_rate': 4.2242857142857145e-05, 'epoch': 5.56}\n",
1134
+ "{'loss': 0.0268, 'learning_rate': 4.188571428571429e-05, 'epoch': 5.59}\n",
1135
+ "{'loss': 0.028, 'learning_rate': 4.1528571428571425e-05, 'epoch': 5.62}\n",
1136
+ "{'loss': 0.0209, 'learning_rate': 4.117142857142857e-05, 'epoch': 5.64}\n",
1137
+ "{'loss': 0.0258, 'learning_rate': 4.081428571428572e-05, 'epoch': 5.67}\n",
1138
+ "{'loss': 0.0249, 'learning_rate': 4.045714285714286e-05, 'epoch': 5.7}\n",
1139
+ "{'loss': 0.0249, 'learning_rate': 4.0100000000000006e-05, 'epoch': 5.73}\n",
1140
+ "{'loss': 0.0209, 'learning_rate': 3.9742857142857146e-05, 'epoch': 5.75}\n",
1141
+ "{'loss': 0.02, 'learning_rate': 3.9385714285714286e-05, 'epoch': 5.78}\n",
1142
+ "{'loss': 0.0244, 'learning_rate': 3.902857142857143e-05, 'epoch': 5.81}\n",
1143
+ "{'loss': 0.025, 'learning_rate': 3.867142857142857e-05, 'epoch': 5.84}\n",
1144
+ "{'loss': 0.0282, 'learning_rate': 3.831428571428571e-05, 'epoch': 5.86}\n",
1145
+ "{'loss': 0.0271, 'learning_rate': 3.795714285714286e-05, 'epoch': 5.89}\n",
1146
+ "{'loss': 0.0233, 'learning_rate': 3.76e-05, 'epoch': 5.92}\n",
1147
+ "{'loss': 0.0219, 'learning_rate': 3.7242857142857146e-05, 'epoch': 5.95}\n",
1148
+ "{'loss': 0.0232, 'learning_rate': 3.688571428571429e-05, 'epoch': 5.97}\n",
1149
+ "{'loss': 0.019, 'learning_rate': 3.6528571428571426e-05, 'epoch': 6.0}\n",
1150
+ "{'loss': 0.0152, 'learning_rate': 3.617142857142857e-05, 'epoch': 6.03}\n",
1151
+ "{'loss': 0.0111, 'learning_rate': 3.581428571428572e-05, 'epoch': 6.06}\n",
1152
+ "{'loss': 0.0162, 'learning_rate': 3.545714285714286e-05, 'epoch': 6.08}\n",
1153
+ "{'loss': 0.0126, 'learning_rate': 3.51e-05, 'epoch': 6.11}\n",
1154
+ "{'loss': 0.012, 'learning_rate': 3.4742857142857146e-05, 'epoch': 6.14}\n",
1155
+ "{'loss': 0.0153, 'learning_rate': 3.4385714285714286e-05, 'epoch': 6.17}\n",
1156
+ "{'loss': 0.0133, 'learning_rate': 3.402857142857143e-05, 'epoch': 6.19}\n",
1157
+ "{'loss': 0.0112, 'learning_rate': 3.367142857142857e-05, 'epoch': 6.22}\n",
1158
+ "{'loss': 0.0187, 'learning_rate': 3.331428571428571e-05, 'epoch': 6.25}\n",
1159
+ "{'loss': 0.0134, 'learning_rate': 3.295714285714286e-05, 'epoch': 6.28}\n",
1160
+ "{'loss': 0.0112, 'learning_rate': 3.26e-05, 'epoch': 6.31}\n",
1161
+ "{'loss': 0.0096, 'learning_rate': 3.2242857142857146e-05, 'epoch': 6.33}\n",
1162
+ "{'loss': 0.0112, 'learning_rate': 3.1885714285714286e-05, 'epoch': 6.36}\n",
1163
+ "{'loss': 0.0146, 'learning_rate': 3.1528571428571426e-05, 'epoch': 6.39}\n",
1164
+ "{'loss': 0.0106, 'learning_rate': 3.117142857142857e-05, 'epoch': 6.42}\n",
1165
+ "{'loss': 0.01, 'learning_rate': 3.081428571428572e-05, 'epoch': 6.44}\n",
1166
+ "{'loss': 0.0117, 'learning_rate': 3.0457142857142856e-05, 'epoch': 6.47}\n",
1167
+ "{'loss': 0.0135, 'learning_rate': 3.01e-05, 'epoch': 6.5}\n",
1168
+ "{'loss': 0.0137, 'learning_rate': 2.9742857142857143e-05, 'epoch': 6.53}\n",
1169
+ "{'loss': 0.0089, 'learning_rate': 2.938571428571429e-05, 'epoch': 6.55}\n",
1170
+ "{'loss': 0.0096, 'learning_rate': 2.9028571428571427e-05, 'epoch': 6.58}\n",
1171
+ "{'loss': 0.0111, 'learning_rate': 2.867142857142857e-05, 'epoch': 6.61}\n"
1172
+ ]
1173
+ },
1174
+ {
1175
+ "data": {
1176
+ "application/vnd.jupyter.widget-view+json": {
1177
+ "model_id": "a911edc03d0b43edbd4c59958024dca9",
1178
+ "version_major": 2,
1179
+ "version_minor": 0
1180
+ },
1181
+ "text/plain": [
1182
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1183
+ ]
1184
+ },
1185
+ "metadata": {},
1186
+ "output_type": "display_data"
1187
+ },
1188
+ {
1189
+ "name": "stdout",
1190
+ "output_type": "stream",
1191
+ "text": [
1192
+ "{'eval_loss': 0.3060542345046997, 'eval_wer': 31.015281015281015, 'eval_runtime': 366.4932, 'eval_samples_per_second': 2.074, 'eval_steps_per_second': 0.259, 'epoch': 6.61}\n",
1193
+ "{'loss': 0.0091, 'learning_rate': 2.8314285714285717e-05, 'epoch': 6.64}\n",
1194
+ "{'loss': 0.0075, 'learning_rate': 2.795714285714286e-05, 'epoch': 6.66}\n",
1195
+ "{'loss': 0.0096, 'learning_rate': 2.7600000000000003e-05, 'epoch': 6.69}\n",
1196
+ "{'loss': 0.0071, 'learning_rate': 2.7242857142857143e-05, 'epoch': 6.72}\n",
1197
+ "{'loss': 0.0089, 'learning_rate': 2.6885714285714287e-05, 'epoch': 6.75}\n",
1198
+ "{'loss': 0.0103, 'learning_rate': 2.652857142857143e-05, 'epoch': 6.77}\n",
1199
+ "{'loss': 0.0125, 'learning_rate': 2.6171428571428574e-05, 'epoch': 6.8}\n",
1200
+ "{'loss': 0.0082, 'learning_rate': 2.5814285714285713e-05, 'epoch': 6.83}\n",
1201
+ "{'loss': 0.0079, 'learning_rate': 2.5457142857142857e-05, 'epoch': 6.86}\n",
1202
+ "{'loss': 0.0108, 'learning_rate': 2.51e-05, 'epoch': 6.88}\n",
1203
+ "{'loss': 0.0084, 'learning_rate': 2.4742857142857147e-05, 'epoch': 6.91}\n",
1204
+ "{'loss': 0.0107, 'learning_rate': 2.4385714285714287e-05, 'epoch': 6.94}\n",
1205
+ "{'loss': 0.009, 'learning_rate': 2.402857142857143e-05, 'epoch': 6.97}\n",
1206
+ "{'loss': 0.0081, 'learning_rate': 2.3671428571428574e-05, 'epoch': 6.99}\n",
1207
+ "{'loss': 0.0077, 'learning_rate': 2.3314285714285717e-05, 'epoch': 7.02}\n",
1208
+ "{'loss': 0.0064, 'learning_rate': 2.2957142857142857e-05, 'epoch': 7.05}\n",
1209
+ "{'loss': 0.0079, 'learning_rate': 2.26e-05, 'epoch': 7.08}\n",
1210
+ "{'loss': 0.0063, 'learning_rate': 2.2242857142857144e-05, 'epoch': 7.1}\n",
1211
+ "{'loss': 0.0044, 'learning_rate': 2.1885714285714287e-05, 'epoch': 7.13}\n",
1212
+ "{'loss': 0.0041, 'learning_rate': 2.1528571428571427e-05, 'epoch': 7.16}\n",
1213
+ "{'loss': 0.0048, 'learning_rate': 2.1171428571428574e-05, 'epoch': 7.19}\n",
1214
+ "{'loss': 0.0041, 'learning_rate': 2.0814285714285714e-05, 'epoch': 7.21}\n",
1215
+ "{'loss': 0.0031, 'learning_rate': 2.0457142857142857e-05, 'epoch': 7.24}\n",
1216
+ "{'loss': 0.0026, 'learning_rate': 2.01e-05, 'epoch': 7.27}\n",
1217
+ "{'loss': 0.0031, 'learning_rate': 1.9742857142857144e-05, 'epoch': 7.3}\n",
1218
+ "{'loss': 0.0029, 'learning_rate': 1.9385714285714287e-05, 'epoch': 7.32}\n",
1219
+ "{'loss': 0.0045, 'learning_rate': 1.9028571428571427e-05, 'epoch': 7.35}\n",
1220
+ "{'loss': 0.0024, 'learning_rate': 1.8671428571428574e-05, 'epoch': 7.38}\n",
1221
+ "{'loss': 0.002, 'learning_rate': 1.8314285714285714e-05, 'epoch': 7.41}\n",
1222
+ "{'loss': 0.0038, 'learning_rate': 1.7957142857142858e-05, 'epoch': 7.43}\n",
1223
+ "{'loss': 0.0035, 'learning_rate': 1.76e-05, 'epoch': 7.46}\n",
1224
+ "{'loss': 0.0058, 'learning_rate': 1.7242857142857144e-05, 'epoch': 7.49}\n",
1225
+ "{'loss': 0.0034, 'learning_rate': 1.6885714285714284e-05, 'epoch': 7.52}\n",
1226
+ "{'loss': 0.0036, 'learning_rate': 1.652857142857143e-05, 'epoch': 7.54}\n",
1227
+ "{'loss': 0.0031, 'learning_rate': 1.6171428571428574e-05, 'epoch': 7.57}\n",
1228
+ "{'loss': 0.0041, 'learning_rate': 1.5814285714285714e-05, 'epoch': 7.6}\n",
1229
+ "{'loss': 0.0021, 'learning_rate': 1.5457142857142858e-05, 'epoch': 7.63}\n",
1230
+ "{'loss': 0.0032, 'learning_rate': 1.51e-05, 'epoch': 7.65}\n",
1231
+ "{'loss': 0.0039, 'learning_rate': 1.4742857142857144e-05, 'epoch': 7.68}\n",
1232
+ "{'loss': 0.0028, 'learning_rate': 1.4385714285714286e-05, 'epoch': 7.71}\n"
1233
+ ]
1234
+ },
1235
+ {
1236
+ "data": {
1237
+ "application/vnd.jupyter.widget-view+json": {
1238
+ "model_id": "3ff56df660d1427daf8f920cd57d67fc",
1239
+ "version_major": 2,
1240
+ "version_minor": 0
1241
+ },
1242
+ "text/plain": [
1243
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1244
+ ]
1245
+ },
1246
+ "metadata": {},
1247
+ "output_type": "display_data"
1248
+ },
1249
+ {
1250
+ "name": "stdout",
1251
+ "output_type": "stream",
1252
+ "text": [
1253
+ "{'eval_loss': 0.3143082559108734, 'eval_wer': 27.169127169127172, 'eval_runtime': 357.6908, 'eval_samples_per_second': 2.125, 'eval_steps_per_second': 0.266, 'epoch': 7.71}\n",
1254
+ "{'loss': 0.0032, 'learning_rate': 1.402857142857143e-05, 'epoch': 7.74}\n",
1255
+ "{'loss': 0.0028, 'learning_rate': 1.3671428571428571e-05, 'epoch': 7.76}\n",
1256
+ "{'loss': 0.0033, 'learning_rate': 1.3314285714285715e-05, 'epoch': 7.79}\n",
1257
+ "{'loss': 0.0052, 'learning_rate': 1.2957142857142856e-05, 'epoch': 7.82}\n",
1258
+ "{'loss': 0.0023, 'learning_rate': 1.2600000000000001e-05, 'epoch': 7.85}\n",
1259
+ "{'loss': 0.0034, 'learning_rate': 1.2242857142857143e-05, 'epoch': 7.87}\n",
1260
+ "{'loss': 0.0026, 'learning_rate': 1.1885714285714286e-05, 'epoch': 7.9}\n",
1261
+ "{'loss': 0.0022, 'learning_rate': 1.1528571428571428e-05, 'epoch': 7.93}\n",
1262
+ "{'loss': 0.0037, 'learning_rate': 1.1171428571428571e-05, 'epoch': 7.96}\n",
1263
+ "{'loss': 0.003, 'learning_rate': 1.0814285714285715e-05, 'epoch': 7.98}\n",
1264
+ "{'loss': 0.0009, 'learning_rate': 1.0457142857142856e-05, 'epoch': 8.01}\n",
1265
+ "{'loss': 0.0006, 'learning_rate': 1.0100000000000002e-05, 'epoch': 8.04}\n",
1266
+ "{'loss': 0.0022, 'learning_rate': 9.742857142857143e-06, 'epoch': 8.07}\n",
1267
+ "{'loss': 0.0005, 'learning_rate': 9.385714285714287e-06, 'epoch': 8.09}\n",
1268
+ "{'loss': 0.0007, 'learning_rate': 9.02857142857143e-06, 'epoch': 8.12}\n",
1269
+ "{'loss': 0.0006, 'learning_rate': 8.671428571428572e-06, 'epoch': 8.15}\n",
1270
+ "{'loss': 0.0006, 'learning_rate': 8.314285714285715e-06, 'epoch': 8.18}\n",
1271
+ "{'loss': 0.0005, 'learning_rate': 7.957142857142857e-06, 'epoch': 8.2}\n",
1272
+ "{'loss': 0.0024, 'learning_rate': 7.6e-06, 'epoch': 8.23}\n",
1273
+ "{'loss': 0.0009, 'learning_rate': 7.242857142857143e-06, 'epoch': 8.26}\n",
1274
+ "{'loss': 0.0004, 'learning_rate': 6.885714285714286e-06, 'epoch': 8.29}\n",
1275
+ "{'loss': 0.0006, 'learning_rate': 6.5285714285714285e-06, 'epoch': 8.31}\n",
1276
+ "{'loss': 0.0031, 'learning_rate': 6.171428571428572e-06, 'epoch': 8.34}\n",
1277
+ "{'loss': 0.001, 'learning_rate': 5.814285714285714e-06, 'epoch': 8.37}\n",
1278
+ "{'loss': 0.0013, 'learning_rate': 5.457142857142857e-06, 'epoch': 8.4}\n",
1279
+ "{'loss': 0.0011, 'learning_rate': 5.1e-06, 'epoch': 8.43}\n",
1280
+ "{'loss': 0.0004, 'learning_rate': 4.742857142857144e-06, 'epoch': 8.45}\n",
1281
+ "{'loss': 0.0012, 'learning_rate': 4.385714285714286e-06, 'epoch': 8.48}\n",
1282
+ "{'loss': 0.0013, 'learning_rate': 4.028571428571429e-06, 'epoch': 8.51}\n",
1283
+ "{'loss': 0.0008, 'learning_rate': 3.6714285714285717e-06, 'epoch': 8.54}\n",
1284
+ "{'loss': 0.0009, 'learning_rate': 3.314285714285714e-06, 'epoch': 8.56}\n",
1285
+ "{'loss': 0.0019, 'learning_rate': 2.957142857142857e-06, 'epoch': 8.59}\n",
1286
+ "{'loss': 0.0006, 'learning_rate': 2.6e-06, 'epoch': 8.62}\n",
1287
+ "{'loss': 0.0004, 'learning_rate': 2.242857142857143e-06, 'epoch': 8.65}\n",
1288
+ "{'loss': 0.0005, 'learning_rate': 1.8857142857142858e-06, 'epoch': 8.67}\n",
1289
+ "{'loss': 0.0005, 'learning_rate': 1.5285714285714287e-06, 'epoch': 8.7}\n",
1290
+ "{'loss': 0.001, 'learning_rate': 1.1714285714285715e-06, 'epoch': 8.73}\n",
1291
+ "{'loss': 0.0012, 'learning_rate': 8.142857142857143e-07, 'epoch': 8.76}\n",
1292
+ "{'loss': 0.001, 'learning_rate': 4.571428571428572e-07, 'epoch': 8.78}\n",
1293
+ "{'loss': 0.0014, 'learning_rate': 1.0000000000000001e-07, 'epoch': 8.81}\n"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "data": {
1298
+ "application/vnd.jupyter.widget-view+json": {
1299
+ "model_id": "5120f3a800554c1689dfb561c55440fb",
1300
+ "version_major": 2,
1301
+ "version_minor": 0
1302
+ },
1303
+ "text/plain": [
1304
+ " 0%| | 0/95 [00:00<?, ?it/s]"
1305
+ ]
1306
+ },
1307
+ "metadata": {},
1308
+ "output_type": "display_data"
1309
+ },
1310
+ {
1311
+ "name": "stdout",
1312
+ "output_type": "stream",
1313
+ "text": [
1314
+ "{'eval_loss': 0.318661630153656, 'eval_wer': 27.36337736337736, 'eval_runtime': 356.3989, 'eval_samples_per_second': 2.132, 'eval_steps_per_second': 0.267, 'epoch': 8.81}\n",
1315
+ "{'train_runtime': 48216.7702, 'train_samples_per_second': 2.655, 'train_steps_per_second': 0.166, 'train_loss': 0.13303363310021815, 'epoch': 8.81}\n"
1316
+ ]
1317
+ },
1318
+ {
1319
+ "data": {
1320
+ "text/plain": [
1321
+ "TrainOutput(global_step=8000, training_loss=0.13303363310021815, metrics={'train_runtime': 48216.7702, 'train_samples_per_second': 2.655, 'train_steps_per_second': 0.166, 'train_loss': 0.13303363310021815, 'epoch': 8.81})"
1322
+ ]
1323
+ },
1324
+ "execution_count": 111,
1325
+ "metadata": {},
1326
+ "output_type": "execute_result"
1327
+ }
1328
+ ],
1329
+ "source": [
1330
+ "trainer.train()"
1331
+ ]
1332
+ },
1333
+ {
1334
+ "cell_type": "code",
1335
+ "execution_count": 34,
1336
+ "metadata": {},
1337
+ "outputs": [
1338
+ {
1339
+ "ename": "OSError",
1340
+ "evalue": "It looks like the config file at './whisper-base-vi/pytorch_model.bin' is not a valid JSON file.",
1341
+ "output_type": "error",
1342
+ "traceback": [
1343
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1344
+ "\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
1345
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/configuration_utils.py:702\u001b[0m, in \u001b[0;36mPretrainedConfig._get_config_dict\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 701\u001b[0m \u001b[39m# Load config dict\u001b[39;00m\n\u001b[0;32m--> 702\u001b[0m config_dict \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49m_dict_from_json_file(resolved_config_file)\n\u001b[1;32m 703\u001b[0m config_dict[\u001b[39m\"\u001b[39m\u001b[39m_commit_hash\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m commit_hash\n",
1346
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/configuration_utils.py:793\u001b[0m, in \u001b[0;36mPretrainedConfig._dict_from_json_file\u001b[0;34m(cls, json_file)\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(json_file, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m, encoding\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mutf-8\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m reader:\n\u001b[0;32m--> 793\u001b[0m text \u001b[39m=\u001b[39m reader\u001b[39m.\u001b[39;49mread()\n\u001b[1;32m 794\u001b[0m \u001b[39mreturn\u001b[39;00m json\u001b[39m.\u001b[39mloads(text)\n",
1347
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/codecs.py:322\u001b[0m, in \u001b[0;36mBufferedIncrementalDecoder.decode\u001b[0;34m(self, input, final)\u001b[0m\n\u001b[1;32m 321\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbuffer \u001b[39m+\u001b[39m \u001b[39minput\u001b[39m\n\u001b[0;32m--> 322\u001b[0m (result, consumed) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_buffer_decode(data, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merrors, final)\n\u001b[1;32m 323\u001b[0m \u001b[39m# keep undecoded input until the next call\u001b[39;00m\n",
1348
+ "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0x80 in position 64: invalid start byte",
1349
+ "\nDuring handling of the above exception, another exception occurred:\n",
1350
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
1351
+ "Cell \u001b[0;32mIn[34], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m pt_model \u001b[39m=\u001b[39m WhisperForConditionalGeneration\u001b[39m.\u001b[39;49mfrom_pretrained(\u001b[39m\"\u001b[39;49m\u001b[39m./whisper-base-vi/pytorch_model.bin\u001b[39;49m\u001b[39m\"\u001b[39;49m, from_tf\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 2\u001b[0m pt_model\u001b[39m.\u001b[39msave_pretrained(\u001b[39m\"\u001b[39m\u001b[39m./whisper-base-vi/vi_whisper.pt\u001b[39m\u001b[39m\"\u001b[39m)\n",
1352
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/modeling_utils.py:2325\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 2323\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(config, PretrainedConfig):\n\u001b[1;32m 2324\u001b[0m config_path \u001b[39m=\u001b[39m config \u001b[39mif\u001b[39;00m config \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m pretrained_model_name_or_path\n\u001b[0;32m-> 2325\u001b[0m config, model_kwargs \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mconfig_class\u001b[39m.\u001b[39;49mfrom_pretrained(\n\u001b[1;32m 2326\u001b[0m config_path,\n\u001b[1;32m 2327\u001b[0m cache_dir\u001b[39m=\u001b[39;49mcache_dir,\n\u001b[1;32m 2328\u001b[0m return_unused_kwargs\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 2329\u001b[0m force_download\u001b[39m=\u001b[39;49mforce_download,\n\u001b[1;32m 2330\u001b[0m resume_download\u001b[39m=\u001b[39;49mresume_download,\n\u001b[1;32m 2331\u001b[0m proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m 2332\u001b[0m local_files_only\u001b[39m=\u001b[39;49mlocal_files_only,\n\u001b[1;32m 2333\u001b[0m token\u001b[39m=\u001b[39;49mtoken,\n\u001b[1;32m 2334\u001b[0m revision\u001b[39m=\u001b[39;49mrevision,\n\u001b[1;32m 2335\u001b[0m subfolder\u001b[39m=\u001b[39;49msubfolder,\n\u001b[1;32m 2336\u001b[0m _from_auto\u001b[39m=\u001b[39;49mfrom_auto_class,\n\u001b[1;32m 2337\u001b[0m _from_pipeline\u001b[39m=\u001b[39;49mfrom_pipeline,\n\u001b[1;32m 2338\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 2339\u001b[0m )\n\u001b[1;32m 2340\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 2341\u001b[0m model_kwargs \u001b[39m=\u001b[39m kwargs\n",
1353
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/configuration_utils.py:590\u001b[0m, in \u001b[0;36mPretrainedConfig.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)\u001b[0m\n\u001b[1;32m 586\u001b[0m kwargs[\u001b[39m\"\u001b[39m\u001b[39mrevision\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m revision\n\u001b[1;32m 588\u001b[0m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_set_token_in_kwargs(kwargs, token)\n\u001b[0;32m--> 590\u001b[0m config_dict, kwargs \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mget_config_dict(pretrained_model_name_or_path, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 591\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mmodel_type\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m config_dict \u001b[39mand\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mcls\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mmodel_type\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mand\u001b[39;00m config_dict[\u001b[39m\"\u001b[39m\u001b[39mmodel_type\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m!=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mmodel_type:\n\u001b[1;32m 592\u001b[0m logger\u001b[39m.\u001b[39mwarning(\n\u001b[1;32m 593\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mYou are using a model of type \u001b[39m\u001b[39m{\u001b[39;00mconfig_dict[\u001b[39m'\u001b[39m\u001b[39mmodel_type\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m}\u001b[39;00m\u001b[39m to instantiate a model of type \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 594\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mmodel_type\u001b[39m}\u001b[39;00m\u001b[39m. This is not supported for all configurations of models and can yield errors.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 595\u001b[0m )\n",
1354
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/configuration_utils.py:617\u001b[0m, in \u001b[0;36mPretrainedConfig.get_config_dict\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 615\u001b[0m original_kwargs \u001b[39m=\u001b[39m copy\u001b[39m.\u001b[39mdeepcopy(kwargs)\n\u001b[1;32m 616\u001b[0m \u001b[39m# Get config dict associated with the base config file\u001b[39;00m\n\u001b[0;32m--> 617\u001b[0m config_dict, kwargs \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49m_get_config_dict(pretrained_model_name_or_path, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 618\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m_commit_hash\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m config_dict:\n\u001b[1;32m 619\u001b[0m original_kwargs[\u001b[39m\"\u001b[39m\u001b[39m_commit_hash\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m config_dict[\u001b[39m\"\u001b[39m\u001b[39m_commit_hash\u001b[39m\u001b[39m\"\u001b[39m]\n",
1355
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/configuration_utils.py:705\u001b[0m, in \u001b[0;36mPretrainedConfig._get_config_dict\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 703\u001b[0m config_dict[\u001b[39m\"\u001b[39m\u001b[39m_commit_hash\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m commit_hash\n\u001b[1;32m 704\u001b[0m \u001b[39mexcept\u001b[39;00m (json\u001b[39m.\u001b[39mJSONDecodeError, \u001b[39mUnicodeDecodeError\u001b[39;00m):\n\u001b[0;32m--> 705\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 706\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIt looks like the config file at \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mresolved_config_file\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m is not a valid JSON file.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 707\u001b[0m )\n\u001b[1;32m 709\u001b[0m \u001b[39mif\u001b[39;00m is_local:\n\u001b[1;32m 710\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mloading configuration file \u001b[39m\u001b[39m{\u001b[39;00mresolved_config_file\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n",
1356
+ "\u001b[0;31mOSError\u001b[0m: It looks like the config file at './whisper-base-vi/pytorch_model.bin' is not a valid JSON file."
1357
+ ]
1358
+ }
1359
+ ],
1360
+ "source": [
1361
+ "pt_model = WhisperForConditionalGeneration.from_pretrained(\"./whisper-base-vi/pytorch_model.bin\", from_tf=True)\n",
1362
+ "pt_model.save_pretrained(\"./whisper-base-vi/vi_whisper.pt\")"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "cell_type": "code",
1367
+ "execution_count": null,
1368
+ "metadata": {},
1369
+ "outputs": [],
1370
+ "source": [
1371
+ "kwargs = {\n",
1372
+ " \"dataset_tags\": \"vivos-commonvoice\",\n",
1373
+ " \"dataset\": \"Vivos\", \n",
1374
+ " \"language\": \"vi\",\n",
1375
+ " \"model_name\": \"Whisper Small Vi - Duy Ta\", \n",
1376
+ " \"finetuned_from\": \"openai/whisper-small\",\n",
1377
+ " \"tasks\": \"automatic-speech-recognition\",\n",
1378
+ " \"config\" : None\n",
1379
+ "}\n"
1380
+ ]
1381
+ },
1382
+ {
1383
+ "cell_type": "code",
1384
+ "execution_count": 131,
1385
+ "metadata": {},
1386
+ "outputs": [
1387
+ {
1388
+ "name": "stderr",
1389
+ "output_type": "stream",
1390
+ "text": [
1391
+ "Several commits (2) will be pushed upstream.\n",
1392
+ "The progress bars may be unreliable.\n",
1393
+ "error: The destination you provided is not a full refname (i.e.,\n",
1394
+ "starting with \"refs/\"). We tried to guess what you meant by:\n",
1395
+ "\n",
1396
+ "- Looking for a ref that matches 'HEAD' on the remote side.\n",
1397
+ "- Checking if the <src> being pushed ('HEAD')\n",
1398
+ " is a ref in \"refs/{heads,tags}/\". If so we add a corresponding\n",
1399
+ " refs/{heads,tags}/ prefix on the remote side.\n",
1400
+ "\n",
1401
+ "Neither worked, so we gave up. You must fully qualify the ref.\n",
1402
+ "hint: The <src> part of the refspec is a commit object.\n",
1403
+ "hint: Did you mean to create a new branch by pushing to\n",
1404
+ "hint: 'HEAD:refs/heads/HEAD'?\n",
1405
+ "error: failed to push some refs to 'https://huggingface.co/DuyTa/vi_whisper-small'\n",
1406
+ "\n"
1407
+ ]
1408
+ },
1409
+ {
1410
+ "ename": "OSError",
1411
+ "evalue": "error: The destination you provided is not a full refname (i.e.,\nstarting with \"refs/\"). We tried to guess what you meant by:\n\n- Looking for a ref that matches 'HEAD' on the remote side.\n- Checking if the <src> being pushed ('HEAD')\n is a ref in \"refs/{heads,tags}/\". If so we add a corresponding\n refs/{heads,tags}/ prefix on the remote side.\n\nNeither worked, so we gave up. You must fully qualify the ref.\nhint: The <src> part of the refspec is a commit object.\nhint: Did you mean to create a new branch by pushing to\nhint: 'HEAD:refs/heads/HEAD'?\nerror: failed to push some refs to 'https://huggingface.co/DuyTa/vi_whisper-small'\n",
1412
+ "output_type": "error",
1413
+ "traceback": [
1414
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1415
+ "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)",
1416
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/repository.py:1099\u001b[0m, in \u001b[0;36mRepository.git_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[39mif\u001b[39;00m return_code:\n\u001b[0;32m-> 1099\u001b[0m \u001b[39mraise\u001b[39;00m subprocess\u001b[39m.\u001b[39mCalledProcessError(return_code, process\u001b[39m.\u001b[39margs, output\u001b[39m=\u001b[39mstdout, stderr\u001b[39m=\u001b[39mstderr)\n\u001b[1;32m 1101\u001b[0m \u001b[39mexcept\u001b[39;00m subprocess\u001b[39m.\u001b[39mCalledProcessError \u001b[39mas\u001b[39;00m exc:\n",
1417
+ "\u001b[0;31mCalledProcessError\u001b[0m: Command '['git', 'push', '--set-upstream', 'origin', 'HEAD']' returned non-zero exit status 1.",
1418
+ "\nDuring handling of the above exception, another exception occurred:\n",
1419
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
1420
+ "Cell \u001b[0;32mIn[131], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m trainer\u001b[39m.\u001b[39;49mpush_to_hub(commit_message\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mchange\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
1421
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/trainer.py:3609\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 3606\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpush_in_progress\u001b[39m.\u001b[39m_process\u001b[39m.\u001b[39mkill()\n\u001b[1;32m 3607\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpush_in_progress \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 3609\u001b[0m git_head_commit_url \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrepo\u001b[39m.\u001b[39;49mpush_to_hub(\n\u001b[1;32m 3610\u001b[0m commit_message\u001b[39m=\u001b[39;49mcommit_message, blocking\u001b[39m=\u001b[39;49mblocking, auto_lfs_prune\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m\n\u001b[1;32m 3611\u001b[0m )\n\u001b[1;32m 3612\u001b[0m \u001b[39m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 3613\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mshould_save:\n",
1422
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/repository.py:1307\u001b[0m, in \u001b[0;36mRepository.push_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1305\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mgit_add(auto_lfs_track\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 1306\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mgit_commit(commit_message)\n\u001b[0;32m-> 1307\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgit_push(\n\u001b[1;32m 1308\u001b[0m upstream\u001b[39m=\u001b[39;49m\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39morigin \u001b[39;49m\u001b[39m{\u001b[39;49;00m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcurrent_branch\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 1309\u001b[0m blocking\u001b[39m=\u001b[39;49mblocking,\n\u001b[1;32m 1310\u001b[0m auto_lfs_prune\u001b[39m=\u001b[39;49mauto_lfs_prune,\n\u001b[1;32m 1311\u001b[0m )\n",
1423
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/repository.py:1102\u001b[0m, in \u001b[0;36mRepository.git_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1099\u001b[0m \u001b[39mraise\u001b[39;00m subprocess\u001b[39m.\u001b[39mCalledProcessError(return_code, process\u001b[39m.\u001b[39margs, output\u001b[39m=\u001b[39mstdout, stderr\u001b[39m=\u001b[39mstderr)\n\u001b[1;32m 1101\u001b[0m \u001b[39mexcept\u001b[39;00m subprocess\u001b[39m.\u001b[39mCalledProcessError \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m-> 1102\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mEnvironmentError\u001b[39;00m(exc\u001b[39m.\u001b[39mstderr)\n\u001b[1;32m 1104\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m blocking:\n\u001b[1;32m 1106\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mstatus_method\u001b[39m():\n",
1424
+ "\u001b[0;31mOSError\u001b[0m: error: The destination you provided is not a full refname (i.e.,\nstarting with \"refs/\"). We tried to guess what you meant by:\n\n- Looking for a ref that matches 'HEAD' on the remote side.\n- Checking if the <src> being pushed ('HEAD')\n is a ref in \"refs/{heads,tags}/\". If so we add a corresponding\n refs/{heads,tags}/ prefix on the remote side.\n\nNeither worked, so we gave up. You must fully qualify the ref.\nhint: The <src> part of the refspec is a commit object.\nhint: Did you mean to create a new branch by pushing to\nhint: 'HEAD:refs/heads/HEAD'?\nerror: failed to push some refs to 'https://huggingface.co/DuyTa/vi_whisper-small'\n"
1425
+ ]
1426
+ }
1427
+ ],
1428
+ "source": [
1429
+ "trainer.push_to_hub(commit_message=\"change\")"
1430
+ ]
1431
+ },
1432
+ {
1433
+ "cell_type": "code",
1434
+ "execution_count": null,
1435
+ "metadata": {
1436
+ "tags": [
1437
+ "parameters"
1438
+ ]
1439
+ },
1440
+ "outputs": [
1441
+ {
1442
+ "data": {
1443
+ "application/vnd.jupyter.widget-view+json": {
1444
+ "model_id": "b6e666bab7b2450abf3e2adf07679122",
1445
+ "version_major": 2,
1446
+ "version_minor": 0
1447
+ },
1448
+ "text/plain": [
1449
+ "Downloading (…)lve/main/config.json: 0%| | 0.00/1.31k [00:00<?, ?B/s]"
1450
+ ]
1451
+ },
1452
+ "metadata": {},
1453
+ "output_type": "display_data"
1454
+ },
1455
+ {
1456
+ "data": {
1457
+ "application/vnd.jupyter.widget-view+json": {
1458
+ "model_id": "b212026dca9241cf994f9710f0b93c22",
1459
+ "version_major": 2,
1460
+ "version_minor": 0
1461
+ },
1462
+ "text/plain": [
1463
+ "Downloading (…)okenizer_config.json: 0%| | 0.00/838 [00:00<?, ?B/s]"
1464
+ ]
1465
+ },
1466
+ "metadata": {},
1467
+ "output_type": "display_data"
1468
+ }
1469
+ ],
1470
+ "source": [
1471
+ "from transformers import WhisperForConditionalGeneration, WhisperProcessor\n",
1472
+ "\n",
1473
+ "model = WhisperForConditionalGeneration.from_pretrained(\"DuyTa/vi_whisper\")\n",
1474
+ "processor = WhisperProcessor.from_pretrained(\"DuyTa/vi_whisper\")\n"
1475
+ ]
1476
+ },
1477
+ {
1478
+ "cell_type": "code",
1479
+ "execution_count": 36,
1480
+ "metadata": {},
1481
+ "outputs": [
1482
+ {
1483
+ "ename": "RuntimeError",
1484
+ "evalue": "Instantiating a pipeline without a task set raised an error: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './vi_whisper-small'.",
1485
+ "output_type": "error",
1486
+ "traceback": [
1487
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1488
+ "\u001b[0;31mHFValidationError\u001b[0m Traceback (most recent call last)",
1489
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/__init__.py:432\u001b[0m, in \u001b[0;36mget_task\u001b[0;34m(model, use_auth_token)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 432\u001b[0m info \u001b[39m=\u001b[39m model_info(model, token\u001b[39m=\u001b[39;49muse_auth_token)\n\u001b[1;32m 433\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n",
1490
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:110\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[39mif\u001b[39;00m arg_name \u001b[39min\u001b[39;00m [\u001b[39m\"\u001b[39m\u001b[39mrepo_id\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mfrom_id\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mto_id\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m--> 110\u001b[0m validate_repo_id(arg_value)\n\u001b[1;32m 112\u001b[0m \u001b[39melif\u001b[39;00m arg_name \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtoken\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m arg_value \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
1491
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:164\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[0;34m(repo_id)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m REPO_ID_REGEX\u001b[39m.\u001b[39mmatch(repo_id):\n\u001b[0;32m--> 164\u001b[0m \u001b[39mraise\u001b[39;00m HFValidationError(\n\u001b[1;32m 165\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mRepo id must use alphanumeric chars or \u001b[39m\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m\u001b[39m, \u001b[39m\u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m'\u001b[39m\u001b[39m, \u001b[39m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39m\u001b[39m'\u001b[39m\u001b[39m, \u001b[39m\u001b[39m'\u001b[39m\u001b[39m--\u001b[39m\u001b[39m'\u001b[39m\u001b[39m and \u001b[39m\u001b[39m'\u001b[39m\u001b[39m..\u001b[39m\u001b[39m'\u001b[39m\u001b[39m are\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 166\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m forbidden, \u001b[39m\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m\u001b[39m and \u001b[39m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39m\u001b[39m'\u001b[39m\u001b[39m cannot start or end the name, max length is 96:\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 167\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mrepo_id\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 168\u001b[0m )\n\u001b[1;32m 170\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m--\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m repo_id \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m..\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m repo_id:\n",
1492
+ "\u001b[0;31mHFValidationError\u001b[0m: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './vi_whisper-small'.",
1493
+ "\nDuring handling of the above exception, another exception occurred:\n",
1494
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
1495
+ "Cell \u001b[0;32mIn[36], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtransformers\u001b[39;00m \u001b[39mimport\u001b[39;00m pipeline\n\u001b[1;32m 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mgradio\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mgr\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m pipe \u001b[39m=\u001b[39m pipeline(model\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m./vi_whisper-small\u001b[39;49m\u001b[39m\"\u001b[39;49m) \n\u001b[1;32m 6\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mtranscribe\u001b[39m(audio):\n\u001b[1;32m 7\u001b[0m text \u001b[39m=\u001b[39m pipe(audio)[\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m]\n",
1496
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/__init__.py:726\u001b[0m, in \u001b[0;36mpipeline\u001b[0;34m(task, model, config, tokenizer, feature_extractor, image_processor, framework, revision, use_fast, use_auth_token, device, device_map, torch_dtype, trust_remote_code, model_kwargs, pipeline_class, **kwargs)\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, \u001b[39mstr\u001b[39m):\n\u001b[1;32m 722\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 723\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mInferring the task automatically requires to check the hub with a model_id defined as a `str`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 724\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mmodel\u001b[39m}\u001b[39;00m\u001b[39m is not a valid model_id.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 725\u001b[0m )\n\u001b[0;32m--> 726\u001b[0m task \u001b[39m=\u001b[39m get_task(model, use_auth_token)\n\u001b[1;32m 728\u001b[0m \u001b[39m# Retrieve the task\u001b[39;00m\n\u001b[1;32m 729\u001b[0m \u001b[39mif\u001b[39;00m task \u001b[39min\u001b[39;00m custom_tasks:\n",
1497
+ "File \u001b[0;32m~/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/__init__.py:434\u001b[0m, in \u001b[0;36mget_task\u001b[0;34m(model, use_auth_token)\u001b[0m\n\u001b[1;32m 432\u001b[0m info \u001b[39m=\u001b[39m model_info(model, token\u001b[39m=\u001b[39muse_auth_token)\n\u001b[1;32m 433\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m--> 434\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mInstantiating a pipeline without a task set raised an error: \u001b[39m\u001b[39m{\u001b[39;00me\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 435\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m info\u001b[39m.\u001b[39mpipeline_tag:\n\u001b[1;32m 436\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 437\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThe model \u001b[39m\u001b[39m{\u001b[39;00mmodel\u001b[39m}\u001b[39;00m\u001b[39m does not seem to have a correct `pipeline_tag` set to infer the task automatically\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 438\u001b[0m )\n",
1498
+ "\u001b[0;31mRuntimeError\u001b[0m: Instantiating a pipeline without a task set raised an error: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './vi_whisper-small'."
1499
+ ]
1500
+ }
1501
+ ],
1502
+ "source": [
1503
+ "from transformers import pipeline\n",
1504
+ "import gradio as gr\n",
1505
+ "\n",
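+ "# Note: passing the task explicitly, e.g. pipeline(task=\"automatic-speech-recognition\", model=\"./vi_whisper-small\"), should avoid the hub repo-id lookup that raises the error shown above for a local path.\n",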
1506
+ "pipe = pipeline(model=\"./vi_whisper-small\") \n",
1507
+ "\n",
1508
+ "def transcribe(audio):\n",
1509
+ " text = pipe(audio)[\"text\"]\n",
1510
+ " return text\n",
1511
+ "\n",
1512
+ "iface = gr.Interface(\n",
1513
+ " fn=transcribe,\n",
1514
+ " inputs=gr.Audio(source=\"upload\", type=\"filepath\"),\n",
1515
+ " outputs=\"text\",\n",
1516
+ " title=\"Whisper Small Vietnamese\",\n",
1517
+ " description=\"Realtime demo for Vietnamese speech recognition using a fine-tuned Whisper small model.\",\n",
1518
+ ")\n",
1519
+ "\n",
1520
+ "iface.launch()"
1521
+ ]
1522
+ }
1523
+ ],
1524
+ "metadata": {
1525
+ "kernelspec": {
1526
+ "display_name": "DUY",
1527
+ "language": "python",
1528
+ "name": "python3"
1529
+ },
1530
+ "language_info": {
1531
+ "codemirror_mode": {
1532
+ "name": "ipython",
1533
+ "version": 3
1534
+ },
1535
+ "file_extension": ".py",
1536
+ "mimetype": "text/x-python",
1537
+ "name": "python",
1538
+ "nbconvert_exporter": "python",
1539
+ "pygments_lexer": "ipython3",
1540
+ "version": "3.9.17"
1541
+ },
1542
+ "orig_nbformat": 4
1543
+ },
1544
+ "nbformat": 4,
1545
+ "nbformat_minor": 2
1546
+ }
src/training.py ADDED
@@ -0,0 +1,183 @@
1
+ from huggingface_hub import interpreter_login
2
+
3
+ from datasets import load_dataset, DatasetDict, load_from_disk
4
+
5
+ from transformers import WhisperProcessor
6
+ from transformers import WhisperForConditionalGeneration
7
+ from transformers import Seq2SeqTrainingArguments
8
+ from transformers import Seq2SeqTrainer
9
+ from transformers import EarlyStoppingCallback
10
+ from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
11
+ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
12
+
13
+ from peft import prepare_model_for_int8_training
14
+ from peft import PeftModel, LoraModel, LoraConfig, get_peft_model
15
+
16
+ import torch
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, List, Union
20
+
21
+ import evaluate
22
+
23
+ import os
24
+
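+ # Save only the LoRA adapter weights at each checkpoint and drop the full pytorch_model.bin so checkpoints stay small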
25
+ class SavePeftModelCallback(TrainerCallback):
26
+ def on_save(
27
+ self,
28
+ args: TrainingArguments,
29
+ state: TrainerState,
30
+ control: TrainerControl,
31
+ **kwargs,
32
+ ):
33
+ checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
34
+
35
+ peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
36
+ kwargs["model"].save_pretrained(peft_model_path)
37
+
38
+ pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
39
+ if os.path.exists(pytorch_model_path):
40
+ os.remove(pytorch_model_path)
41
+ return control
42
+
43
+ @dataclass
44
+ class DataCollatorSpeechSeq2SeqWithPadding:
45
+ processor: Any
46
+
47
+ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
48
+ # split inputs and labels since they have to be of different lengths and need different padding methods
49
+ # first treat the audio inputs by simply returning torch tensors
50
+ input_features = [{"input_features": feature["input_features"]} for feature in features]
51
+ batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
52
+
53
+ # get the tokenized label sequences
54
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
55
+
56
+ # The attention_mask column exists only for the augmented dataset; remove this line if your features do not carry one
57
+ batch["attention_mask"] = torch.LongTensor([feature["attention_mask"] for feature in features])
58
+
59
+ # pad the labels to max length
60
+ labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
61
+
62
+ # replace padding with -100 to ignore loss correctly
63
+ labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
64
+
65
+ # if bos token is appended in previous tokenization step,
66
+ # cut bos token here as it's append later anyways
67
+ if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
68
+ labels = labels[:, 1:]
69
+
70
+ batch["labels"] = labels
71
+
72
+ return batch
73
+
74
+ def compute_metrics(pred):
75
+ pred_ids = pred.predictions
76
+ label_ids = pred.label_ids
77
+
78
+ # replace -100 with the pad_token_id
79
+ label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
80
+
81
+ # we do not want to group tokens when computing the metrics
82
+ pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
83
+ label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
84
+
85
+ wer = 100 * metric.compute(predictions=pred_str, references=label_str)
86
+
87
+ return {"wer": wer}
88
+
89
+
90
+
91
+ if __name__ == "__main__":
92
+
93
+
94
+ early_stopping_callback = EarlyStoppingCallback(
95
+ early_stopping_patience=3, # Stop training if the metric doesn't improve for 3 evaluations
96
+ early_stopping_threshold=0.0005, # Minimum change in the metric to be considered an improvement
97
+ )
98
+
99
+ # Load Dataset
100
+ processed_dataset = DatasetDict()
101
+ processed_dataset = load_from_disk("./vin_clean")
102
+
103
+
104
+ print(processed_dataset)
105
+
106
+ # load processor
107
+ processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="Vietnamese", task="transcribe")
108
+
109
+
110
+ # initialize data collator
111
+ data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
112
+
113
+ # download metric
114
+ metric = evaluate.load("wer")
115
+
116
+ # Download model in 8bit
117
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium", load_in_8bit=True, device_map="auto")
118
+ model.config.forced_decoder_ids = None
119
+ model.config.suppress_tokens = []
120
+
121
+ # preparing model with PEFT
122
+ model = prepare_model_for_int8_training(model, output_embedding_layer_name="proj_out")
123
+
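+ # LoRA adapters of rank 32 (alpha 64) on the attention query/value projections; only these low-rank weights are trained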
124
+ config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
125
+
126
+ model = get_peft_model(model, config)
127
+ model.print_trainable_parameters()
128
+
129
+
130
+ # Define training arguments
131
+ training_args = Seq2SeqTrainingArguments(
132
+ output_dir="./whisper-medium-Lora", # change to a repo name of your choice
133
+ per_device_train_batch_size=32,
134
+ gradient_accumulation_steps=2, # increase by 2x for every 2x decrease in batch size
135
+ learning_rate=5e-5,
136
+ warmup_steps=500,
137
+ max_steps=10000,
138
+ evaluation_strategy="steps",
139
+ gradient_checkpointing=True,
140
+ optim="adamw_torch",
141
+ fp16=True,
142
+ per_device_eval_batch_size=8,
143
+ generation_max_length=225,
144
+ save_steps=2000,
145
+ eval_steps=500,
146
+ logging_steps=25,
147
+ report_to=["tensorboard"],
148
+ predict_with_generate=True,
149
+ # load_best_model_at_end=True,  # EarlyStoppingCallback expects this to be True, but reloading would fail because SavePeftModelCallback removes pytorch_model.bin
150
+ metric_for_best_model="wer",
151
+ greater_is_better=False,
152
+ # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
153
+ remove_unused_columns=False,
154
+ label_names=["labels"], # same reason as above
155
+ push_to_hub=False,
156
+ )
157
+
158
+ # initialize trainer
159
+ trainer = Seq2SeqTrainer(
160
+ args=training_args,
161
+ model=model,
162
+ train_dataset=processed_dataset["train"],
163
+ eval_dataset=processed_dataset["test"],
164
+ data_collator=data_collator,
165
+ tokenizer=processor.feature_extractor,
166
+ callbacks=[early_stopping_callback, SavePeftModelCallback],
167
+ )
168
+
169
+
170
+ # start training
171
+ trainer.train()
172
+
173
+
174
+ # set up args and push to hub
175
+ kwargs = {
176
+ "dataset": "vin100h",
177
+ "language": "vi",
178
+ "model_name": "Whisper Medium LoRA - Clean Data",
179
+ "finetuned_from": "openai/whisper-medium",
180
+ "tasks": "automatic-speech-recognition",
181
+ }
182
+
183
+ model.push_to_hub(**kwargs)
src/vin_whisper_medium.ipynb ADDED
@@ -0,0 +1,1164 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/whisper/__init__.py\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "import whisper\n",
18
+ "print(whisper.__file__)\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "\n",
29
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "name": "stdout",
39
+ "output_type": "stream",
40
+ "text": [
41
+ "True\n",
42
+ "2\n",
43
+ "0\n",
44
+ "<torch.cuda.device object at 0x7f69e1e31eb0>\n",
45
+ "Tesla T4\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "import torch\n",
51
+ "\n",
52
+ "print(torch.cuda.is_available())\n",
53
+ "\n",
54
+ "\n",
55
+ "print(torch.cuda.device_count())\n",
56
+ "\n",
57
+ "\n",
58
+ "print(torch.cuda.current_device())\n",
59
+ "print(torch.cuda.device(0))\n",
60
+ "\n",
61
+ "print(torch.cuda.get_device_name(0))\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 2,
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "data": {
71
+ "application/vnd.jupyter.widget-view+json": {
72
+ "model_id": "f290d4efc37a4112a662c062e621e482",
73
+ "version_major": 2,
74
+ "version_minor": 0
75
+ },
76
+ "text/plain": [
77
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
78
+ ]
79
+ },
80
+ "metadata": {},
81
+ "output_type": "display_data"
82
+ }
83
+ ],
84
+ "source": [
85
+ "from huggingface_hub import notebook_login\n",
86
+ "\n",
87
+ "notebook_login()"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 2,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "model_name_or_path = \"openai/whisper-medium\"\n",
97
+ "task = \"transcribe\""
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 3,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "dataset_name = \"Vin100h_MITI_private\"\n",
107
+ "language = \"Vietnamese\"\n",
108
+ "language_abbr = \"vi\" # Shorthand code for the language we want to fine-tune"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 4,
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "DatasetDict({\n",
121
+ " train: Dataset({\n",
122
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
123
+ " num_rows: 1679\n",
124
+ " })\n",
125
+ " test: Dataset({\n",
126
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
127
+ " num_rows: 420\n",
128
+ " })\n",
129
+ "})\n",
130
+ "DatasetDict({\n",
131
+ " train: Dataset({\n",
132
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
133
+ " num_rows: 6735\n",
134
+ " })\n",
135
+ " test: Dataset({\n",
136
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
137
+ " num_rows: 1688\n",
138
+ " })\n",
139
+ "})\n"
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ " # Load Dataset\n",
145
+ "from datasets import load_dataset, DatasetDict, load_from_disk\n",
146
+ "processed_dataset = DatasetDict()\n",
147
+ "processed_dataset = load_from_disk(\"./MITI_clean\")\n",
148
+ "processed_dataset2 = load_from_disk(\"./vin_10h/\")\n",
149
+ "\n",
150
+ "print(processed_dataset)\n",
151
+ "print(processed_dataset2)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 49,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "from datasets import Dataset\n",
161
+ "\n",
162
+ "# Assuming you have already loaded your dataset\n",
163
+ "# processed_dataset2 = ...\n",
164
+ "\n",
165
+ "# Randomly select 5000 indices from the train dataset\n",
166
+ "import random\n",
167
+ "num_samples_train = 5000\n",
168
+ "num_samples_test = 600\n",
169
+ "random_indices_train = random.sample(range(len(processed_dataset2['train'])), num_samples_train)\n",
170
+ "random_indices_test = random.sample(range(len(processed_dataset2['test'])), num_samples_test)\n",
171
+ "\n",
172
+ "# Initialize lists for train dataset\n",
173
+ "input_features_train = []\n",
174
+ "input_length_train = []\n",
175
+ "attention_mask_train = []\n",
176
+ "labels_train = []\n",
177
+ "\n",
178
+ "# Initialize lists for test dataset\n",
179
+ "input_features_test = []\n",
180
+ "input_length_test = []\n",
181
+ "attention_mask_test = []\n",
182
+ "labels_test = []\n",
183
+ "\n",
184
+ "# Populate lists for train dataset\n",
185
+ "for i in random_indices_train:\n",
186
+ " input_features_train.append(processed_dataset2['train'][i]['input_features'])\n",
187
+ " input_length_train.append(processed_dataset2['train'][i]['input_length'])\n",
188
+ " attention_mask_train.append(processed_dataset2['train'][i]['attention_mask'])\n",
189
+ " labels_train.append(processed_dataset2['train'][i]['labels'])\n",
190
+ "\n",
191
+ "# Populate lists for test dataset\n",
192
+ "for i in random_indices_test:\n",
193
+ " input_features_test.append(processed_dataset2['test'][i]['input_features'])\n",
194
+ " input_length_test.append(processed_dataset2['test'][i]['input_length'])\n",
195
+ " attention_mask_test.append(processed_dataset2['test'][i]['attention_mask'])\n",
196
+ " labels_test.append(processed_dataset2['test'][i]['labels'])\n",
197
+ "\n",
198
+ "# Create a new dataset with the randomly selected rows\n",
199
+ "random_subset = Dataset.from_dict({\n",
200
+ " 'train': {\n",
201
+ " 'input_features': input_features_train,\n",
202
+ " 'input_length': input_length_train,\n",
203
+ " 'attention_mask': attention_mask_train,\n",
204
+ " 'labels': labels_train,\n",
205
+ " },\n",
206
+ " 'test': {\n",
207
+ " 'input_features': input_features_test,\n",
208
+ " 'input_length': input_length_test,\n",
209
+ " 'attention_mask': attention_mask_test,\n",
210
+ " 'labels': labels_test,\n",
211
+ " }\n",
212
+ "})\n",
213
+ "\n",
214
+ "\n"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 5,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "import datasets\n",
224
+ "concat = DatasetDict()\n",
225
+ "concat[\"train\"] = datasets.concatenate_datasets([processed_dataset[\"train\"], processed_dataset2[\"train\"]])\n",
226
+ "concat['test']= datasets.concatenate_datasets([processed_dataset[\"test\"], processed_dataset2[\"test\"]])\n"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 7,
232
+ "metadata": {},
233
+ "outputs": [
234
+ {
235
+ "data": {
236
+ "text/plain": [
237
+ "DatasetDict({\n",
238
+ " train: Dataset({\n",
239
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
240
+ " num_rows: 8414\n",
241
+ " })\n",
242
+ " test: Dataset({\n",
243
+ " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n",
244
+ " num_rows: 2108\n",
245
+ " })\n",
246
+ "})"
247
+ ]
248
+ },
249
+ "execution_count": 7,
250
+ "metadata": {},
251
+ "output_type": "execute_result"
252
+ }
253
+ ],
254
+ "source": [
255
+ "concat"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 7,
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "from transformers import WhisperFeatureExtractor\n",
265
+ "\n",
266
+ "feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 6,
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": [
275
+ "\n",
276
+ "from transformers import WhisperTokenizer\n",
277
+ "\n",
278
+ "tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 11,
284
+ "metadata": {},
285
+ "outputs": [
286
+ {
287
+ "data": {
288
+ "text/plain": [
289
+ "('./Viet_ASR/tokenizer_config.json',\n",
290
+ " './Viet_ASR/special_tokens_map.json',\n",
291
+ " './Viet_ASR/vocab.json',\n",
292
+ " './Viet_ASR/merges.txt',\n",
293
+ " './Viet_ASR/normalizer.json',\n",
294
+ " './Viet_ASR/added_tokens.json')"
295
+ ]
296
+ },
297
+ "execution_count": 11,
298
+ "metadata": {},
299
+ "output_type": "execute_result"
300
+ }
301
+ ],
302
+ "source": [
303
+ "tokenizer.save_pretrained('./Viet_ASR')"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 8,
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "import torch\n",
313
+ "\n",
314
+ "from dataclasses import dataclass\n",
315
+ "from typing import Any, Dict, List, Union\n",
316
+ "from transformers import WhisperProcessor\n",
317
+ "\n",
318
+ "processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)\n",
319
+ "@dataclass\n",
320
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
321
+ " processor: Any\n",
322
+ "\n",
323
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
324
+ " # split inputs and labels since they have to be of different lengths and need different padding methods\n",
325
+ " # first treat the audio inputs by simply returning torch tensors\n",
326
+ " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
327
+ " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
328
+ "\n",
329
+ " # get the tokenized label sequences\n",
330
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
331
+ "\n",
332
+ "\n",
333
+ " # pad the labels to max length\n",
334
+ " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
335
+ "\n",
336
+ " # replace padding with -100 to ignore loss correctly\n",
337
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
338
+ "\n",
339
+ " # if bos token is appended in previous tokenization step,\n",
340
+ " # cut bos token here as it's append later anyways\n",
341
+ " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
342
+ " labels = labels[:, 1:]\n",
343
+ "\n",
344
+ " batch[\"labels\"] = labels\n",
345
+ "\n",
346
+ " return batch\n",
347
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 9,
353
+ "metadata": {},
354
+ "outputs": [],
355
+ "source": [
356
+ "import evaluate\n",
357
+ "\n",
358
+ "metric = evaluate.load(\"wer\")"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 12,
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": [
367
+ "from transformers import WhisperForConditionalGeneration\n",
368
+ "\n",
369
+ "model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-medium', load_in_8bit=True, device_map=\"auto\" )"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": 13,
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "model.config.forced_decoder_ids = None\n",
379
+ "model.config.suppress_tokens = []"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 14,
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "data": {
389
+ "text/plain": [
390
+ "<torch.utils.hooks.RemovableHandle at 0x7f1a9445da60>"
391
+ ]
392
+ },
393
+ "execution_count": 14,
394
+ "metadata": {},
395
+ "output_type": "execute_result"
396
+ }
397
+ ],
398
+ "source": [
399
+ "from peft import prepare_model_for_kbit_training\n",
400
+ "\n",
401
+ "model = prepare_model_for_kbit_training(model)\n",
402
+ "def make_inputs_require_grad(module, input, output):\n",
403
+ " output.requires_grad_(True)\n",
404
+ "\n",
405
+ "model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 15,
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "name": "stdout",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "trainable params: 9,437,184 || all params: 773,295,104 || trainable%: 1.2203858463844612\n"
418
+ ]
419
+ }
420
+ ],
421
+ "source": [
422
+ "from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model\n",
423
+ "#target_modules = [\"k_proj\", \"q_proj\", \"v_proj\", \"out_proj\", \"fc1\", \"fc2\"] #will it better ?\n",
424
+ "target_modules=[\"q_proj\", \"v_proj\"]\n",
425
+ "config = LoraConfig(r=32, lora_alpha=64, target_modules=target_modules, lora_dropout=0.05, bias=\"none\")\n",
426
+ "\n",
427
+ "model = get_peft_model(model, config)\n",
428
+ "model.print_trainable_parameters()"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": 16,
434
+ "metadata": {},
435
+ "outputs": [],
436
+ "source": [
437
+ "from transformers import Seq2SeqTrainingArguments\n",
438
+ "\n",
439
+ "training_args = Seq2SeqTrainingArguments(\n",
440
+ " output_dir=\"./Vietnamese_ASR\", \n",
441
+ " per_device_train_batch_size=10,\n",
442
+ " #auto_find_batch_size = True,\n",
443
+ " gradient_accumulation_steps=2, # increase by 2x for every 2x decrease in batch size\n",
444
+ " learning_rate=5e-5,\n",
445
+ " warmup_steps=50,\n",
446
+ " num_train_epochs=3,\n",
447
+ " evaluation_strategy=\"epoch\",\n",
448
+ " gradient_checkpointing=True,\n",
449
+ " optim=\"adamw_torch\",\n",
450
+ " fp16=True,\n",
451
+ " per_device_eval_batch_size=8,\n",
452
+ " generation_max_length=225,\n",
453
+ " logging_steps=100,\n",
454
+ " report_to=[\"tensorboard\"],\n",
455
+ " predict_with_generate=True,\n",
456
+ " # load_best_model_at_end=True,\n",
457
+ " greater_is_better=False,\n",
458
+ " save_strategy = \"epoch\",\n",
459
+ " # required as the PeftModel forward doesn't have the signature of the wrapped model's forward\n",
460
+ " remove_unused_columns=False,\n",
461
+ " label_names=[\"labels\"], # same reason as above\n",
462
+ " push_to_hub=True,\n",
463
+ ")"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": 19,
469
+ "metadata": {},
470
+ "outputs": [
471
+ {
472
+ "name": "stderr",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "/media/tesla/New Volume/DEMO/DUY/Vietnamese_ASR/./Vietnamese_ASR is already a clone of https://huggingface.co/DuyTa/Vietnamese_ASR. Make sure you pull the latest changes with `repo.git_pull()`.\n"
476
+ ]
477
+ }
478
+ ],
479
+ "source": [
480
+ "from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl\n",
481
+ "from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR\n",
482
+ "\n",
483
+ "\n",
484
+ "class SavePeftModelCallback(TrainerCallback):\n",
485
+ " def on_save(\n",
486
+ " self,\n",
487
+ " args: TrainingArguments,\n",
488
+ " state: TrainerState,\n",
489
+ " control: TrainerControl,\n",
490
+ " **kwargs,\n",
491
+ " ):\n",
492
+ " checkpoint_folder = os.path.join(args.output_dir, f\"{PREFIX_CHECKPOINT_DIR}-{state.global_step}\")\n",
493
+ "\n",
494
+ " peft_model_path = os.path.join(checkpoint_folder, \"adapter_model\")\n",
495
+ " kwargs[\"model\"].save_pretrained(peft_model_path)\n",
496
+ "\n",
497
+ " pytorch_model_path = os.path.join(checkpoint_folder, \"pytorch_model.bin\")\n",
498
+ " if os.path.exists(pytorch_model_path):\n",
499
+ " os.remove(pytorch_model_path)\n",
500
+ " return control\n",
501
+ "\n",
502
+ "\n",
503
+ "trainer = Seq2SeqTrainer(\n",
504
+ " args=training_args,\n",
505
+ " model=model,\n",
506
+ " train_dataset=concat[\"train\"],\n",
507
+ " eval_dataset=concat[\"test\"],\n",
508
+ " data_collator=data_collator,\n",
509
+ " # compute_metrics=compute_metrics,\n",
510
+ " tokenizer=processor.feature_extractor,\n",
511
+ " callbacks=[SavePeftModelCallback],\n",
512
+ ")\n",
513
+ "model.config.use_cache = False # silence the warnings. Please re-enable for inference!"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": 20,
519
+ "metadata": {},
520
+ "outputs": [
521
+ {
522
+ "data": {
523
+ "application/vnd.jupyter.widget-view+json": {
524
+ "model_id": "b0aa0180f6e64eaa8951a4c940aa518f",
525
+ "version_major": 2,
526
+ "version_minor": 0
527
+ },
528
+ "text/plain": [
529
+ " 0%| | 0/1263 [00:00<?, ?it/s]"
530
+ ]
531
+ },
532
+ "metadata": {},
533
+ "output_type": "display_data"
534
+ },
535
+ {
536
+ "name": "stderr",
537
+ "output_type": "stream",
538
+ "text": [
539
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
540
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
541
+ ]
542
+ },
543
+ {
544
+ "name": "stdout",
545
+ "output_type": "stream",
546
+ "text": [
547
+ "{'loss': 1.9814, 'learning_rate': 4.814509480626546e-05, 'epoch': 0.24}\n",
548
+ "{'loss': 0.6861, 'learning_rate': 4.402308326463314e-05, 'epoch': 0.48}\n",
549
+ "{'loss': 0.3736, 'learning_rate': 3.9901071723000826e-05, 'epoch': 0.71}\n",
550
+ "{'loss': 0.332, 'learning_rate': 3.577906018136851e-05, 'epoch': 0.95}\n"
551
+ ]
552
+ },
553
+ {
554
+ "data": {
555
+ "application/vnd.jupyter.widget-view+json": {
556
+ "model_id": "2e9a8f06d39e448a9523d9a29699cadc",
557
+ "version_major": 2,
558
+ "version_minor": 0
559
+ },
560
+ "text/plain": [
561
+ " 0%| | 0/264 [00:00<?, ?it/s]"
562
+ ]
563
+ },
564
+ "metadata": {},
565
+ "output_type": "display_data"
566
+ },
567
+ {
568
+ "name": "stdout",
569
+ "output_type": "stream",
570
+ "text": [
571
+ "{'eval_loss': 0.3133259117603302, 'eval_runtime': 887.0949, 'eval_samples_per_second': 2.376, 'eval_steps_per_second': 0.298, 'epoch': 1.0}\n"
572
+ ]
573
+ },
574
+ {
575
+ "name": "stderr",
576
+ "output_type": "stream",
577
+ "text": [
578
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
579
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
580
+ ]
581
+ },
582
+ {
583
+ "name": "stdout",
584
+ "output_type": "stream",
585
+ "text": [
586
+ "{'loss': 0.3005, 'learning_rate': 3.165704863973619e-05, 'epoch': 1.19}\n",
587
+ "{'loss': 0.307, 'learning_rate': 2.753503709810388e-05, 'epoch': 1.43}\n",
588
+ "{'loss': 0.2838, 'learning_rate': 2.341302555647156e-05, 'epoch': 1.66}\n",
589
+ "{'loss': 0.2746, 'learning_rate': 1.9291014014839242e-05, 'epoch': 1.9}\n"
590
+ ]
591
+ },
592
+ {
593
+ "data": {
594
+ "application/vnd.jupyter.widget-view+json": {
595
+ "model_id": "1e65ecdbc96246b8b4721505b4252a8a",
596
+ "version_major": 2,
597
+ "version_minor": 0
598
+ },
599
+ "text/plain": [
600
+ " 0%| | 0/264 [00:00<?, ?it/s]"
601
+ ]
602
+ },
603
+ "metadata": {},
604
+ "output_type": "display_data"
605
+ },
606
+ {
607
+ "name": "stdout",
608
+ "output_type": "stream",
609
+ "text": [
610
+ "{'eval_loss': 0.28433552384376526, 'eval_runtime': 880.1965, 'eval_samples_per_second': 2.395, 'eval_steps_per_second': 0.3, 'epoch': 2.0}\n"
611
+ ]
612
+ },
613
+ {
614
+ "name": "stderr",
615
+ "output_type": "stream",
616
+ "text": [
617
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
618
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
619
+ ]
620
+ },
621
+ {
622
+ "name": "stdout",
623
+ "output_type": "stream",
624
+ "text": [
625
+ "{'loss': 0.2857, 'learning_rate': 1.5169002473206925e-05, 'epoch': 2.14}\n",
626
+ "{'loss': 0.2643, 'learning_rate': 1.104699093157461e-05, 'epoch': 2.38}\n",
627
+ "{'loss': 0.2604, 'learning_rate': 6.924979389942292e-06, 'epoch': 2.61}\n",
628
+ "{'loss': 0.2505, 'learning_rate': 2.8029678483099755e-06, 'epoch': 2.85}\n"
629
+ ]
630
+ },
631
+ {
632
+ "data": {
633
+ "application/vnd.jupyter.widget-view+json": {
634
+ "model_id": "3a4f479ef36f4f00b3c591503b411e5f",
635
+ "version_major": 2,
636
+ "version_minor": 0
637
+ },
638
+ "text/plain": [
639
+ " 0%| | 0/264 [00:00<?, ?it/s]"
640
+ ]
641
+ },
642
+ "metadata": {},
643
+ "output_type": "display_data"
644
+ },
645
+ {
646
+ "name": "stdout",
647
+ "output_type": "stream",
648
+ "text": [
649
+ "{'eval_loss': 0.27759623527526855, 'eval_runtime': 879.7333, 'eval_samples_per_second': 2.396, 'eval_steps_per_second': 0.3, 'epoch': 3.0}\n",
650
+ "{'train_runtime': 35575.7347, 'train_samples_per_second': 0.71, 'train_steps_per_second': 0.036, 'train_loss': 0.4555940831925127, 'epoch': 3.0}\n"
651
+ ]
652
+ },
653
+ {
654
+ "data": {
655
+ "text/plain": [
656
+ "TrainOutput(global_step=1263, training_loss=0.4555940831925127, metrics={'train_runtime': 35575.7347, 'train_samples_per_second': 0.71, 'train_steps_per_second': 0.036, 'train_loss': 0.4555940831925127, 'epoch': 3.0})"
657
+ ]
658
+ },
659
+ "execution_count": 20,
660
+ "metadata": {},
661
+ "output_type": "execute_result"
662
+ }
663
+ ],
664
+ "source": [
665
+ "trainer.train()"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": 22,
671
+ "metadata": {},
672
+ "outputs": [
673
+ {
674
+ "name": "stdout",
675
+ "output_type": "stream",
676
+ "text": [
677
+ "DuyTa/Vietnamese_ASR\n"
678
+ ]
679
+ }
680
+ ],
681
+ "source": [
682
+ "peft_model_id = \"DuyTa/Vietnamese_ASR\"\n",
683
+ "model.push_to_hub(peft_model_id)\n",
684
+ "print(peft_model_id)"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 10,
690
+ "metadata": {},
691
+ "outputs": [],
692
+ "source": [
693
+ "from peft import PeftModel, PeftConfig\n",
694
+ "from transformers import WhisperForConditionalGeneration\n",
695
+ "peft_model_id = \"./Vietnamese_ASR\"\n",
696
+ "peft_config = PeftConfig.from_pretrained(peft_model_id)\n",
697
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
698
+ " peft_config.base_model_name_or_path, load_in_8bit=True, device_map=\"auto\"\n",
699
+ ")\n",
700
+ "model = PeftModel.from_pretrained(model, peft_model_id)"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": 11,
706
+ "metadata": {},
707
+ "outputs": [
708
+ {
709
+ "name": "stderr",
710
+ "output_type": "stream",
711
+ "text": [
712
+ " 0%| | 0/88 [00:00<?, ?it/s]"
713
+ ]
714
+ },
715
+ {
716
+ "name": "stderr",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
720
+ " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
721
+ "100%|██████████| 88/88 [59:07<00:00, 40.31s/it]"
722
+ ]
723
+ },
724
+ {
725
+ "name": "stdout",
726
+ "output_type": "stream",
727
+ "text": [
728
+ "wer=15.57082617523036\n"
729
+ ]
730
+ },
731
+ {
732
+ "name": "stderr",
733
+ "output_type": "stream",
734
+ "text": [
735
+ "\n"
736
+ ]
737
+ }
738
+ ],
739
+ "source": [
740
+ "from torch.utils.data import DataLoader\n",
741
+ "from tqdm import tqdm\n",
742
+ "import numpy as np\n",
743
+ "import gc\n",
744
+ "\n",
745
+ "eval_dataloader = DataLoader(concat[\"test\"], batch_size=24, collate_fn=data_collator)\n",
746
+ "\n",
747
+ "model.eval()\n",
748
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
749
+ " with torch.cuda.amp.autocast():\n",
750
+ " with torch.no_grad():\n",
751
+ " generated_tokens = (\n",
752
+ " model.generate(\n",
753
+ " input_features=batch[\"input_features\"].to(\"cuda\"),\n",
754
+ " decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n",
755
+ " max_new_tokens=255,\n",
756
+ " )\n",
757
+ " .cpu()\n",
758
+ " .numpy()\n",
759
+ " )\n",
760
+ " labels = batch[\"labels\"].cpu().numpy()\n",
761
+ " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
762
+ " decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
763
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
764
+ " metric.add_batch(\n",
765
+ " predictions=decoded_preds,\n",
766
+ " references=decoded_labels,\n",
767
+ " )\n",
768
+ " del generated_tokens, labels, batch\n",
769
+ " gc.collect()\n",
770
+ "wer = 100 * metric.compute()\n",
771
+ "print(f\"{wer=}\")"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "markdown",
776
+ "metadata": {},
777
+ "source": [
778
+ "## Text Norm"
779
+ ]
780
+ },
781
+ {
782
+ "cell_type": "code",
783
+ "execution_count": null,
784
+ "metadata": {},
785
+ "outputs": [],
786
+ "source": [
787
+ "# apply Vietnamese text normalization to the decoded Whisper output before computing WER"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": 12,
793
+ "metadata": {},
794
+ "outputs": [
795
+ {
796
+ "name": "stderr",
797
+ "output_type": "stream",
798
+ "text": [
799
+ "100%|██████████| 88/88 [58:18<00:00, 39.75s/it]"
800
+ ]
801
+ },
802
+ {
803
+ "name": "stdout",
804
+ "output_type": "stream",
805
+ "text": [
806
+ "normalized_wer=14.601364195460137\n"
807
+ ]
808
+ },
809
+ {
810
+ "name": "stderr",
811
+ "output_type": "stream",
812
+ "text": [
813
+ "\n"
814
+ ]
815
+ }
816
+ ],
817
+ "source": [
818
+ "from torch.utils.data import DataLoader\n",
819
+ "from tqdm import tqdm\n",
820
+ "import numpy as np\n",
821
+ "import gc\n",
822
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
823
+ "normalizer = BasicTextNormalizer()\n",
824
+ "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)\n",
825
+ "\n",
826
+ "model.eval()\n",
827
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
828
+ " with torch.cuda.amp.autocast():\n",
829
+ " with torch.no_grad():\n",
830
+ " generated_tokens= model.generate(input_features=batch[\"input_features\"].to(\"cuda\"),\n",
831
+ " forced_decoder_ids=forced_decoder_ids,\n",
832
+ " max_new_tokens=255).cpu().numpy()\n",
833
+ " labels = batch[\"labels\"].cpu().numpy()\n",
834
+ " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
835
+ " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
836
+ " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
837
+ " metric.add_batch(\n",
838
+ " predictions=[normalizer(pred).strip() for pred in decoded_preds],\n",
839
+ " references=[normalizer(label).strip() for label in decoded_labels],\n",
840
+ " )\n",
841
+ " # if step==0:\n",
842
+ " # break\n",
843
+ " del generated_tokens, labels, batch\n",
844
+ " gc.collect()\n",
845
+ "normalized_wer = 100 * metric.compute()\n",
846
+ "print(f\"{normalized_wer=}\")"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": 13,
852
+ "metadata": {},
853
+ "outputs": [
854
+ {
855
+ "name": "stdout",
856
+ "output_type": "stream",
857
+ "text": [
858
+ "pred='toàn bộ phi hành đoàn đã bị giết chết khiến con tàu quân sự đậm thẳng vào ishimura', label='toàn bộ phi hành đoàn đã bị giết chết khiến con tàu quân sự đâm thẳng vào ishimura'\n",
859
+ "pred='đủ kinh nghiệm để mình quản lý nhân viên và mình làm sao để mình đưa ra một cái dịch vụ tốt nhất', label='đủ kinh nghiệm để mình quản lý nhân viên và mình làm sao để mình đưa ra một cái dịch vụ tốt nhất'\n",
860
+ "pred='nói một trong một cái chương trình trong tương lai về ngành thu y thì ở mỹ tất cả các đại học nào lớn đều có ngành thu y hết', label='<unk> nói một trong một cái chương trình trong tương lai về ngành thú y thì ở mỹ tất cả các đại học nào lớn đều có ngành thú y hết'\n",
861
+ "pred='phấn đấu đến năm hai ngàn không trăm hai mười có từ tám trăm đến một ngàn kinh nghiệp tham gia sàn sâu dịch thu mại điện tử của bộ công thương và quốc tế năm mươi phần trăm số này vô bắn trên sàn', label='phấn đấu đến năm hai ngàn không trăm hai mươi có từ tám trăm đến một ngàn doanh nghiệp tham gia sàn giao dịch thương mại điện tử của bộ công thương và quốc tế năm mươi phần trăm số này luôn bán trên sàn'\n",
862
+ "pred='còn trách nhiệm kiểm tra thanh tra là của ủy ban nhân dân các cấp', label='còn trách nhiệm kiểm tra thanh tra là của ủy ban nhân dân các cấp'\n",
863
+ "pred='vậy mà cậu im lặng khóa trái tìm mình chắc địa dành cho cái gì đó vĩ đại hơn chăng', label='vậy mà cậu im lặng khóa trái tim mình chắc để giành cho cái gì đó vĩ đại hơn chăng'\n",
864
+ "pred='khi nộp phiếu trả lời trắc nghiệm thí sinh phải ghi tên và danh sách thí sinh nộp bài', label='khi nộp phiếu trả lời trắc nghiệm thí sinh phải ký tên vào danh sách thí sinh nộp bài'\n",
865
+ "pred='khi nghĩ rằng mình đã khỏi ai ngờ ung thư lại tái phát và tôi đã lắng nghe câu chuyện của tất cả mọi người', label='khi nghĩ rằng mình đã khỏi ai ngờ ung thư lại tái phát và tôi đã lắng nghe câu chuyện của tất cả mọi người'\n",
866
+ "pred='người cùng ấp là trương thật từng muốn kết giao với giám vì ông từ chối', label='người cùng ấp là trương thực từng muốn kết giao với giám bị ông từ chối'\n",
867
+ "pred='bài thơ với những dòng thơ rất xúc động như sau', label='bài thơ với những dòng thơ rất xúc động như sau'\n",
868
+ "pred='công bố chỉ số niềm tin kinh doanh của doanh nghiệp', label='công bố chỉ số niềm tin kinh doanh của doanh nghiệp'\n",
869
+ "pred='khi quanh hồ tổng tới thăng lông đúng lúc tô trung tự đang đánh nhau to với đồ quảng', label='khi quân hộ tống tới thăng long đúng lúc tô trung từ đang đánh nhau to với đỗ quảng'\n",
870
+ "pred='chứ không lẽ bây giờ kêu men trai', label='chứ hổng lẽ bây giờ kêu mê trai'\n",
871
+ "pred='trong thời gian đó anh ấy hãy tâm sự với tôi', label='trong thời gian đó anh ấy hay tâm sự với tôi'\n",
872
+ "pred='mi mo sa lại cho màu sắc lá đẹp không cần dùng đến màu nhuộng hoàng sực dỡ từ duy nhẹ bé đó đã giúp vườn mi mo sa của bà nhanh chóng đem lại lợi nhuộn', label='mi mo sa lại cho màu sắc lá đẹp không cần dùng đến màu nhuộm hoa rực rỡ tư duy nhạy bén đó đã giúp vườn mi mo sa của bà nhanh chóng đem lại lợi nhuận'\n",
873
+ "pred='chơi tìm kiếm tài năng thiên đỉnh god thai lần thế các táo đâu hết cả rồi', label='chơi tìm kiếm tài năng thiên đình gót thai lừn thế các táo đâu hết cả rồi'\n",
874
+ "pred='dù đức và pháp bất đồng sâu sắc nhưng chính kiến của họ thì đều sai', label='dù đức và pháp bất đồng sâu sắc nhưng chính kiến của họ thì đều sai'\n",
875
+ "pred='đại ca bảo không hình anh ra mà ngồi anh đánh đi', label='đại ca bảo buông anh ra mà thôi anh'\n",
876
+ "pred='khi mà mang thai bác thị cũng cảnh báo rồi', label='khi mà mang thai thì bác sĩ cũng cảnh báo rồi'\n",
877
+ "pred='là tăng giảm thất thường và đột xuất kéo dài', label='mà tăng giảm thất thường và đột xuất kéo dài'\n"
878
+ ]
879
+ }
880
+ ],
881
+ "source": [
882
+ "for pred,label in zip(decoded_preds,decoded_labels):\n",
883
+ " print(f\"{pred=}, {label=}\")"
884
+ ]
885
+ },
886
+ {
887
+ "cell_type": "code",
888
+ "execution_count": 14,
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "import torch\n",
893
+ "from transformers import (\n",
894
+ " AutomaticSpeechRecognitionPipeline,\n",
895
+ " WhisperForConditionalGeneration,\n",
896
+ " WhisperTokenizer,\n",
897
+ " WhisperProcessor,\n",
898
+ ")\n",
899
+ "from peft import PeftModel, PeftConfig\n",
900
+ "\n",
901
+ "\n",
902
+ "peft_model_id = \"./Vietnamese_ASR\"\n",
903
+ "language = \"Vietnamese\"\n",
904
+ "task = \"transcribe\"\n",
905
+ "\n",
906
+ "peft_config = PeftConfig.from_pretrained(peft_model_id)\n",
907
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
908
+ " peft_config.base_model_name_or_path\n",
909
+ ")\n",
910
+ "model = PeftModel.from_pretrained(model, peft_model_id)\n",
911
+ "merged_model = model.merge_and_unload()\n"
912
+ ]
913
+ },
914
+ {
915
+ "cell_type": "code",
916
+ "execution_count": 16,
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "merged_model.save_pretrained(\"./Vietnamese_ASR/merged\")"
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "code",
925
+ "execution_count": 17,
926
+ "metadata": {},
927
+ "outputs": [],
928
+ "source": [
929
+ "from transformers import WhisperTokenizer\n",
930
+ "\n",
931
+ "tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-medium', language=language, task=task)"
932
+ ]
933
+ },
934
+ {
935
+ "cell_type": "code",
936
+ "execution_count": 18,
937
+ "metadata": {},
938
+ "outputs": [
939
+ {
940
+ "data": {
941
+ "text/plain": [
942
+ "('./Vietnamese_ASR/merged/tokenizer_config.json',\n",
943
+ " './Vietnamese_ASR/merged/special_tokens_map.json',\n",
944
+ " './Vietnamese_ASR/merged/vocab.json',\n",
945
+ " './Vietnamese_ASR/merged/merges.txt',\n",
946
+ " './Vietnamese_ASR/merged/normalizer.json',\n",
947
+ " './Vietnamese_ASR/merged/added_tokens.json')"
948
+ ]
949
+ },
950
+ "execution_count": 18,
951
+ "metadata": {},
952
+ "output_type": "execute_result"
953
+ }
954
+ ],
955
+ "source": [
956
+ "tokenizer.save_pretrained('./Vietnamese_ASR/merged')"
957
+ ]
958
+ },
959
+ {
960
+ "cell_type": "code",
961
+ "execution_count": 19,
962
+ "metadata": {},
963
+ "outputs": [
964
+ {
965
+ "name": "stdout",
966
+ "output_type": "stream",
967
+ "text": [
968
+ "/bin/bash: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n"
969
+ ]
970
+ }
971
+ ],
972
+ "source": [
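+ "# Convert the merged model to CTranslate2 format for faster inference (e.g. with faster-whisper / whisperX).\n",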
973
+ "!ct2-transformers-converter --model ./Vietnamese_ASR/merged --output_dir ./Vietnamese_ASR/ct2ranslate"
974
+ ]
975
+ },
976
+ {
977
+ "cell_type": "code",
978
+ "execution_count": 20,
979
+ "metadata": {},
980
+ "outputs": [
981
+ {
982
+ "name": "stdout",
983
+ "output_type": "stream",
984
+ "text": [
985
+ "/bin/bash: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n"
986
+ ]
987
+ }
988
+ ],
989
+ "source": [
990
+ "!ct2-transformers-converter --model ./Vietnamese_ASR/merged --output_dir ./Vietnamese_ASR/ct2ranslate/quantized --quantization float16"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": 6,
996
+ "metadata": {},
997
+ "outputs": [
998
+ {
999
+ "name": "stderr",
1000
+ "output_type": "stream",
1001
+ "text": [
1002
+ "/media/tesla/New Volume/DEMO/DUY/Vietnamese_ASR/Vietnamese_ASR/src/Vietnamese_ASR is already a clone of https://huggingface.co/DuyTa/Vietnamese_ASR. Make sure you pull the latest changes with `repo.git_pull()`.\n"
1003
+ ]
1004
+ }
1005
+ ],
1006
+ "source": [
1007
+ "from huggingface_hub import Repository\n",
1008
+ "repo = Repository(local_dir=\"\", clone_from='DuyTa/Vietnamese_ASR')"
1009
+ ]
1010
+ },
1011
+ {
1012
+ "cell_type": "code",
1013
+ "execution_count": 6,
1014
+ "metadata": {},
1015
+ "outputs": [
1016
+ {
1017
+ "data": {
1018
+ "application/vnd.jupyter.widget-view+json": {
1019
+ "model_id": "061e5ea903e04d2e95bc3ff8a8de434b",
1020
+ "version_major": 2,
1021
+ "version_minor": 0
1022
+ },
1023
+ "text/plain": [
1024
+ "Clean file runs/Aug17_22-42-43_tesla-T4/events.out.tfevents.1692289257.tesla-T4.201346.0: 14%|#4 | 1.0…"
1025
+ ]
1026
+ },
1027
+ "metadata": {},
1028
+ "output_type": "display_data"
1029
+ }
1030
+ ],
1031
+ "source": [
1032
+ "repo.git_pull(rebase=True)"
1033
+ ]
1034
+ },
1035
+ {
1036
+ "cell_type": "code",
1037
+ "execution_count": null,
1038
+ "metadata": {},
1039
+ "outputs": [],
1040
+ "source": [
1041
+ "repo.git_add(\".\")\n",
1042
+ "repo.git_commit(commit_message=\"3 epochs finetuning and quantized model\")"
1043
+ ]
1044
+ },
1045
+ {
1046
+ "cell_type": "code",
1047
+ "execution_count": 8,
1048
+ "metadata": {},
1049
+ "outputs": [
1050
+ {
1051
+ "name": "stderr",
1052
+ "output_type": "stream",
1053
+ "text": [
1054
+ "Several commits (3) will be pushed upstream.\n",
1055
+ "The progress bars may be unreliable.\n"
1056
+ ]
1057
+ },
1058
+ {
1059
+ "name": "stderr",
1060
+ "output_type": "stream",
1061
+ "text": [
1062
+ "To https://huggingface.co/DuyTa/Vietnamese_ASR\n",
1063
+ " 63bacc4..82e8e84 main -> main\n",
1064
+ "\n"
1065
+ ]
1066
+ },
1067
+ {
1068
+ "data": {
1069
+ "text/plain": [
1070
+ "'https://huggingface.co/DuyTa/Vietnamese_ASR/commit/82e8e84fe4f1ffee17eff82c39a163f4b81335d5'"
1071
+ ]
1072
+ },
1073
+ "execution_count": 8,
1074
+ "metadata": {},
1075
+ "output_type": "execute_result"
1076
+ }
1077
+ ],
1078
+ "source": [
1079
+ "repo.git_push()"
1080
+ ]
1081
+ },
1082
+ {
1083
+ "cell_type": "code",
1084
+ "execution_count": null,
1085
+ "metadata": {},
1086
+ "outputs": [],
1087
+ "source": [
1088
+ "merged_model.push_to_hub(\"DuyTa/MITI_Whisper\")"
1089
+ ]
1090
+ },
1091
+ {
1092
+ "cell_type": "code",
1093
+ "execution_count": null,
1094
+ "metadata": {},
1095
+ "outputs": [],
1096
+ "source": [
1097
+ "import torch\n",
1098
+ "import gradio as gr\n",
1099
+ "from transformers import (\n",
1100
+ " AutomaticSpeechRecognitionPipeline,\n",
1101
+ " WhisperForConditionalGeneration,\n",
1102
+ " WhisperTokenizer,\n",
1103
+ " WhisperProcessor,\n",
1104
+ ")\n",
1105
+ "from peft import PeftModel, PeftConfig\n",
1106
+ "\n",
1107
+ "\n",
1108
+ "peft_model_id = \"DuyTa/MITI_Whisper\"\n",
1109
+ "language = \"Vietnamese\"\n",
1110
+ "task = \"transcribe\"\n",
1111
+ "peft_config = PeftConfig.from_pretrained(peft_model_id)\n",
1112
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
1113
+ " peft_config.base_model_name_or_path, load_in_8bit=True, device_map=\"auto\"\n",
1114
+ ")\n",
1115
+ "\n",
1116
+ "model = PeftModel.from_pretrained(model, peft_model_id)\n",
1117
+ "tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)\n",
1118
+ "processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)\n",
1119
+ "feature_extractor = processor.feature_extractor\n",
1120
+ "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)\n",
1121
+ "pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)\n",
1122
+ "\n",
1123
+ "\n",
1124
+ "def transcribe(audio):\n",
1125
+ " with torch.cuda.amp.autocast():\n",
1126
+ " text = pipe(audio, generate_kwargs={\"forced_decoder_ids\": forced_decoder_ids}, max_new_tokens=255)[\"text\"]\n",
1127
+ " return text\n",
1128
+ "\n",
1129
+ "\n",
1130
+ "iface = gr.Interface(\n",
1131
+ " fn=transcribe,\n",
1132
+ " inputs=gr.Audio(source=\"upload\", type=\"filepath\"),\n",
1133
+ " outputs=\"text\",\n",
1134
+ " title=\"PEFT LoRA\",\n",
1135
+ " description=\"Realtime demo for Vietnamese speech recognition using `PEFT-LoRA+INT8` fine-tuned Whisper Medium .\",\n",
1136
+ ")\n",
1137
+ "\n",
1138
+ "iface.launch(share=True)"
1139
+ ]
1140
+ }
1141
+ ],
1142
+ "metadata": {
1143
+ "kernelspec": {
1144
+ "display_name": "DUY",
1145
+ "language": "python",
1146
+ "name": "python3"
1147
+ },
1148
+ "language_info": {
1149
+ "codemirror_mode": {
1150
+ "name": "ipython",
1151
+ "version": 3
1152
+ },
1153
+ "file_extension": ".py",
1154
+ "mimetype": "text/x-python",
1155
+ "name": "python",
1156
+ "nbconvert_exporter": "python",
1157
+ "pygments_lexer": "ipython3",
1158
+ "version": "3.9.17"
1159
+ },
1160
+ "orig_nbformat": 4
1161
+ },
1162
+ "nbformat": 4,
1163
+ "nbformat_minor": 2
1164
+ }
src/whisperX.ipynb ADDED
@@ -0,0 +1,131 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
13
+ "Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118. Bad things might happen unless you revert torch to 1.x.\n",
14
+ "CPU times: user 826 ms, sys: 96.7 ms, total: 923 ms\n",
15
+ "Wall time: 831 ms\n",
16
+ "[{'text': 'đó là ước vọng của nguyễn ái quốc từ những năm hai mươi của thế kỷ trước về một nhà nước việt nam độc lập dân chủ', 'start': 0.008, 'end': 6.556}]\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import whisperx\n",
22
+ "import gc \n",
23
+ "\n",
24
+ "device = \"cuda\" \n",
25
+ "audio_file = \"6.wav\"\n",
26
+ "batch_size = 16 \n",
27
+ "compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
28
+ "model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
29
+ "# 1. Transcribe with original whisper (batched)\n",
30
+ "model = whisperx.load_model(model_path, device, compute_type=compute_type,language='vi')\n",
31
+ "\n",
32
+ "audio = whisperx.load_audio(audio_file)\n",
33
+ "%time result = model.transcribe(audio, batch_size=batch_size)\n",
34
+ "print(result[\"segments\"]) # before alignment\n",
35
+ "\n",
36
+ "# delete model if low on GPU resources\n",
37
+ "# import gc; gc.collect(); torch.cuda.empty_cache(); del model\n",
38
+ "\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import gc; gc.collect()\n",
48
+ "import torch\n",
49
+ "torch.cuda.empty_cache(); del model"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 9,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stderr",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vi and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
62
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
63
+ ]
64
+ },
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "[{'start': 0.008, 'end': 2.396, 'text': 'với một người đi làm thuê như anh số tiền kiếm được chưa đủ để thoả mãn nhu cầu cá nhân nói gì đến chăm lo cho gia đình', 'words': [{'word': 'với', 'start': 0.008, 'end': 0.068, 'score': 0.01}, {'word': 'một', 'start': 0.088, 'end': 0.148, 'score': 0.01}, {'word': 'người', 'start': 0.169, 'end': 0.269, 'score': 0.011}, {'word': 'đi', 'start': 0.289, 'end': 0.329, 'score': 0.01}, {'word': 'làm', 'start': 0.349, 'end': 0.409, 'score': 0.011}, {'word': 'thuê', 'start': 0.429, 'end': 0.51, 'score': 0.01}, {'word': 'như', 'start': 0.53, 'end': 0.59, 'score': 0.012}, {'word': 'anh', 'start': 0.61, 'end': 0.67, 'score': 0.01}, {'word': 'số', 'start': 0.69, 'end': 0.73, 'score': 0.01}, {'word': 'tiền', 'start': 0.75, 'end': 0.831, 'score': 0.01}, {'word': 'kiếm', 'start': 0.851, 'end': 0.931, 'score': 0.01}, {'word': 'được', 'start': 0.951, 'end': 1.031, 'score': 0.01}, {'word': 'chưa', 'start': 1.051, 'end': 1.132, 'score': 0.01}, {'word': 'đủ', 'start': 1.152, 'end': 1.192, 'score': 0.01}, {'word': 'để', 'start': 1.212, 'end': 1.252, 'score': 0.01}, {'word': 'thoả', 'start': 1.272, 'end': 1.353, 'score': 0.01}, {'word': 'mãn', 'start': 1.373, 'end': 1.433, 'score': 0.011}, {'word': 'nhu', 'start': 1.453, 'end': 1.513, 'score': 0.011}, {'word': 'cầu', 'start': 1.533, 'end': 1.593, 'score': 0.011}, {'word': 'cá', 'start': 1.613, 'end': 1.654, 'score': 0.01}, {'word': 'nhân', 'start': 1.674, 'end': 1.754, 'score': 0.011}, {'word': 'nói', 'start': 1.774, 'end': 1.834, 'score': 0.01}, {'word': 'gì', 'start': 1.854, 'end': 1.894, 'score': 0.011}, {'word': 'đến', 'start': 1.914, 'end': 1.975, 'score': 0.01}, {'word': 'chăm', 'start': 1.995, 'end': 2.075, 'score': 0.011}, {'word': 'lo', 'start': 2.095, 'end': 2.135, 'score': 0.009}, {'word': 'cho', 'start': 2.155, 'end': 2.215, 'score': 0.011}, {'word': 'gia', 'start': 2.235, 'end': 2.296, 'score': 0.01}, {'word': 'đình', 'start': 2.316, 'end': 2.396, 'score': 0.011}]}]\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "# 2. Align whisper output\n",
75
+ "device = \"cuda\" \n",
76
+ "audio_file = \"audio.wav\"\n",
77
+ "batch_size = 16 \n",
78
+ "compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
79
+ "model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
80
+ "model_a, metadata = whisperx.load_align_model(language_code=\"vi\" ,device=device)\n",
81
+ "result = whisperx.align(result[\"segments\"], model_a, metadata, audio, device, return_char_alignments=False)\n",
82
+ "\n",
83
+ "print(result[\"segments\"]) # after alignment\n",
84
+ "\n",
85
+ "# delete model if low on GPU resources\n",
86
+ "import gc; gc.collect(); torch.cuda.empty_cache(); del model_a\n",
87
+ "\n"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": null,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "# 3. Assign speaker labels\n",
97
+ "diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)\n",
98
+ "\n",
99
+ "# add min/max number of speakers if known\n",
100
+ "diarize_segments = diarize_model(audio)\n",
101
+ "# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)\n",
102
+ "\n",
103
+ "result = whisperx.assign_word_speakers(diarize_segments, result)\n",
104
+ "print(diarize_segments)\n",
105
+ "print(result[\"segments\"]) # segments are now assigned speaker IDs"
106
+ ]
107
+ }
108
+ ],
109
+ "metadata": {
110
+ "kernelspec": {
111
+ "display_name": "DUY",
112
+ "language": "python",
113
+ "name": "python3"
114
+ },
115
+ "language_info": {
116
+ "codemirror_mode": {
117
+ "name": "ipython",
118
+ "version": 3
119
+ },
120
+ "file_extension": ".py",
121
+ "mimetype": "text/x-python",
122
+ "name": "python",
123
+ "nbconvert_exporter": "python",
124
+ "pygments_lexer": "ipython3",
125
+ "version": "3.9.17"
126
+ },
127
+ "orig_nbformat": 4
128
+ },
129
+ "nbformat": 4,
130
+ "nbformat_minor": 2
131
+ }
src/whisper_quant.py ADDED
@@ -0,0 +1,995 @@
1
+ import itertools
2
+ import logging
3
+ import os
4
+ import zlib
5
+
6
+ from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
7
+
8
+ import ctranslate2
9
+ import numpy as np
10
+ import tokenizers
11
+
12
+ from faster_whisper.audio import decode_audio
13
+ from faster_whisper.feature_extractor import FeatureExtractor
14
+ from faster_whisper.tokenizer import Tokenizer
15
+ from download_quantized import download_model, format_timestamp, get_logger
16
+ from faster_whisper.vad import (
17
+ SpeechTimestampsMap,
18
+ VadOptions,
19
+ collect_chunks,
20
+ get_speech_timestamps,
21
+ )
22
+
23
+
24
+ class Word(NamedTuple):
25
+ start: float
26
+ end: float
27
+ word: str
28
+ probability: float
29
+
30
+
31
+ class Segment(NamedTuple):
32
+ id: int
33
+ seek: int
34
+ start: float
35
+ end: float
36
+ text: str
37
+ tokens: List[int]
38
+ temperature: float
39
+ avg_logprob: float
40
+ compression_ratio: float
41
+ no_speech_prob: float
42
+ words: Optional[List[Word]]
43
+
44
+
45
+ class TranscriptionOptions(NamedTuple):
46
+ beam_size: int
47
+ best_of: int
48
+ patience: float
49
+ length_penalty: float
50
+ repetition_penalty: float
51
+ log_prob_threshold: Optional[float]
52
+ no_speech_threshold: Optional[float]
53
+ compression_ratio_threshold: Optional[float]
54
+ condition_on_previous_text: bool
55
+ prompt_reset_on_temperature: float
56
+ temperatures: List[float]
57
+ initial_prompt: Optional[Union[str, Iterable[int]]]
58
+ prefix: Optional[str]
59
+ suppress_blank: bool
60
+ suppress_tokens: Optional[List[int]]
61
+ without_timestamps: bool
62
+ max_initial_timestamp: float
63
+ word_timestamps: bool
64
+ prepend_punctuations: str
65
+ append_punctuations: str
66
+
67
+
68
+ class TranscriptionInfo(NamedTuple):
69
+ language: str
70
+ language_probability: float
71
+ duration: float
72
+ all_language_probs: Optional[List[Tuple[str, float]]]
73
+ transcription_options: TranscriptionOptions
74
+ vad_options: VadOptions
75
+
76
+
77
+ class WhisperModel:
78
+ def __init__(
79
+ self,
80
+ model_size_or_path: str,
81
+ device: str = "auto",
82
+ device_index: Union[int, List[int]] = 0,
83
+ compute_type: str = "default",
84
+ cpu_threads: int = 0,
85
+ num_workers: int = 1,
86
+ download_root: Optional[str] = None,
87
+ local_files_only: bool = False,
88
+ ):
89
+ """Initializes the Whisper model.
90
+
91
+ Args:
92
+ model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
93
+ small, small.en, medium, medium.en, large-v1, or large-v2), a path to a converted
94
+ model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
95
+ When a size or a model ID is configured, the converted model is downloaded
96
+ from the Hugging Face Hub.
97
+ device: Device to use for computation ("cpu", "cuda", "auto").
98
+ device_index: Device ID to use.
99
+ The model can also be loaded on multiple GPUs by passing a list of IDs
100
+ (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
101
+ when transcribe() is called from multiple Python threads (see also num_workers).
102
+ compute_type: Type to use for computation.
103
+ See https://opennmt.net/CTranslate2/quantization.html.
104
+ cpu_threads: Number of threads to use when running on CPU (4 by default).
105
+ A non-zero value overrides the OMP_NUM_THREADS environment variable.
106
+ num_workers: When transcribe() is called from multiple Python threads,
107
+ having multiple workers enables true parallelism when running the model
108
+ (concurrent calls to self.model.generate() will run in parallel).
109
+ This can improve the global throughput at the cost of increased memory usage.
110
+ download_root: Directory where the models should be saved. If not set, the models
111
+ are saved in the standard Hugging Face cache directory.
112
+ local_files_only: If True, avoid downloading the file and return the path to the
113
+ local cached file if it exists.
114
+ """
115
+ self.logger = get_logger()
116
+
117
+ if os.path.isdir(model_size_or_path):
118
+ model_path = model_size_or_path
119
+ else:
120
+ model_path = download_model(
121
+ model_size_or_path,
122
+ local_files_only=local_files_only,
123
+ cache_dir=download_root,
124
+ )
125
+
126
+ self.model = ctranslate2.models.Whisper(
127
+ model_path,
128
+ device=device,
129
+ device_index=device_index,
130
+ compute_type=compute_type,
131
+ intra_threads=cpu_threads,
132
+ inter_threads=num_workers,
133
+ )
134
+
135
+ tokenizer_file = os.path.join(model_path, "tokenizer.json")
136
+ if os.path.isfile(tokenizer_file):
137
+ self.hf_tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file)
138
+ else:
139
+ self.hf_tokenizer = tokenizers.Tokenizer.from_pretrained(
140
+ "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
141
+ )
142
+
143
+ self.feature_extractor = FeatureExtractor()
144
+ self.num_samples_per_token = self.feature_extractor.hop_length * 2
145
+ self.frames_per_second = (
146
+ self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
147
+ )
148
+ self.tokens_per_second = (
149
+ self.feature_extractor.sampling_rate // self.num_samples_per_token
150
+ )
151
+ self.input_stride = 2
152
+ self.time_precision = 0.02
153
+ self.max_length = 448
154
+
155
+ def transcribe(
156
+ self,
157
+ audio: Union[str, BinaryIO, np.ndarray],
158
+ language: Optional[str] = None,
159
+ task: str = "transcribe",
160
+ beam_size: int = 5,
161
+ best_of: int = 5,
162
+ patience: float = 1,
163
+ length_penalty: float = 1,
164
+ repetition_penalty: float = 1,
165
+ temperature: Union[float, List[float], Tuple[float, ...]] = [
166
+ 0.0,
167
+ 0.2,
168
+ 0.4,
169
+ 0.6,
170
+ 0.8,
171
+ 1.0,
172
+ ],
173
+ compression_ratio_threshold: Optional[float] = 2.4,
174
+ log_prob_threshold: Optional[float] = -1.0,
175
+ no_speech_threshold: Optional[float] = 0.6,
176
+ condition_on_previous_text: bool = True,
177
+ prompt_reset_on_temperature: float = 0.5,
178
+ initial_prompt: Optional[Union[str, Iterable[int]]] = None,
179
+ prefix: Optional[str] = None,
180
+ suppress_blank: bool = True,
181
+ suppress_tokens: Optional[List[int]] = [-1],
182
+ without_timestamps: bool = False,
183
+ max_initial_timestamp: float = 1.0,
184
+ word_timestamps: bool = False,
185
+ prepend_punctuations: str = "\"'“¿([{-",
186
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
187
+ vad_filter: bool = False,
188
+ vad_parameters: Optional[Union[dict, VadOptions]] = None,
189
+ ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
190
+ """Transcribes an input file.
191
+
192
+ Arguments:
193
+ audio: Path to the input file (or a file-like object), or the audio waveform.
194
+ language: The language spoken in the audio. It should be a language code such
195
+ as "en" or "fr". If not set, the language will be detected in the first 30 seconds
196
+ of audio.
197
+ task: Task to execute (transcribe or translate).
198
+ beam_size: Beam size to use for decoding.
199
+ best_of: Number of candidates when sampling with non-zero temperature.
200
+ patience: Beam search patience factor.
201
+ length_penalty: Exponential length penalty constant.
202
+ repetition_penalty: Penalty applied to the score of previously generated tokens
203
+ (set > 1 to penalize).
204
+ temperature: Temperature for sampling. It can be a tuple of temperatures,
205
+ which will be successively used upon failures according to either
206
+ `compression_ratio_threshold` or `log_prob_threshold`.
207
+ compression_ratio_threshold: If the gzip compression ratio is above this value,
208
+ treat as failed.
209
+ log_prob_threshold: If the average log probability over sampled tokens is
210
+ below this value, treat as failed.
211
+ no_speech_threshold: If the no_speech probability is higher than this value AND
212
+ the average log probability over sampled tokens is below `log_prob_threshold`,
213
+ consider the segment as silent.
214
+ condition_on_previous_text: If True, the previous output of the model is provided
215
+ as a prompt for the next window; disabling may make the text inconsistent across
216
+ windows, but the model becomes less prone to getting stuck in a failure loop,
217
+ such as repetition looping or timestamps going out of sync.
218
+ prompt_reset_on_temperature: Resets prompt if temperature is above this value.
219
+ Arg has effect only if condition_on_previous_text is True.
220
+ initial_prompt: Optional text string or iterable of token ids to provide as a
221
+ prompt for the first window.
222
+ prefix: Optional text to provide as a prefix for the first window.
223
+ suppress_blank: Suppress blank outputs at the beginning of the sampling.
224
+ suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
225
+ of symbols as defined in the model config.json file.
226
+ without_timestamps: Only sample text tokens.
227
+ max_initial_timestamp: The initial timestamp cannot be later than this.
228
+ word_timestamps: Extract word-level timestamps using the cross-attention pattern
229
+ and dynamic time warping, and include the timestamps for each word in each segment.
230
+ prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
231
+ with the next word
232
+ append_punctuations: If word_timestamps is True, merge these punctuation symbols
233
+ with the previous word
234
+ vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
235
+ without speech. This step uses the Silero VAD model
236
+ https://github.com/snakers4/silero-vad.
237
+ vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
238
+ parameters and default values in the class `VadOptions`).
239
+
240
+ Returns:
241
+ A tuple with:
242
+
243
+ - a generator over transcribed segments
244
+ - an instance of TranscriptionInfo
245
+ """
246
+ sampling_rate = self.feature_extractor.sampling_rate
247
+
248
+ if not isinstance(audio, np.ndarray):
249
+ audio = decode_audio(audio, sampling_rate=sampling_rate)
250
+
251
+ duration = audio.shape[0] / sampling_rate
252
+
253
+ self.logger.info(
254
+ "Processing audio with duration %s", format_timestamp(duration)
255
+ )
256
+
257
+ if vad_filter:
258
+ if vad_parameters is None:
259
+ vad_parameters = VadOptions()
260
+ elif isinstance(vad_parameters, dict):
261
+ vad_parameters = VadOptions(**vad_parameters)
262
+ speech_chunks = get_speech_timestamps(audio, vad_parameters)
263
+ audio = collect_chunks(audio, speech_chunks)
264
+
265
+ self.logger.info(
266
+ "VAD filter removed %s of audio",
267
+ format_timestamp(duration - (audio.shape[0] / sampling_rate)),
268
+ )
269
+
270
+ if self.logger.isEnabledFor(logging.DEBUG):
271
+ self.logger.debug(
272
+ "VAD filter kept the following audio segments: %s",
273
+ ", ".join(
274
+ "[%s -> %s]"
275
+ % (
276
+ format_timestamp(chunk["start"] / sampling_rate),
277
+ format_timestamp(chunk["end"] / sampling_rate),
278
+ )
279
+ for chunk in speech_chunks
280
+ ),
281
+ )
282
+
283
+ else:
284
+ speech_chunks = None
285
+
286
+ features = self.feature_extractor(audio)
287
+
288
+ encoder_output = None
289
+ all_language_probs = None
290
+
291
+ if language is None:
292
+ if not self.model.is_multilingual:
293
+ language = "en"
294
+ language_probability = 1
295
+ else:
296
+ segment = features[:, : self.feature_extractor.nb_max_frames]
297
+ encoder_output = self.encode(segment)
298
+ # results is a list of tuple[str, float] with language names and
299
+ # probabilities.
300
+ results = self.model.detect_language(encoder_output)[0]
301
+ # Parse language names to strip out markers
302
+ all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
303
+ # Get top language token and probability
304
+ language, language_probability = all_language_probs[0]
305
+
306
+ self.logger.info(
307
+ "Detected language '%s' with probability %.2f",
308
+ language,
309
+ language_probability,
310
+ )
311
+ else:
312
+ language_probability = 1
313
+
314
+ tokenizer = Tokenizer(
315
+ self.hf_tokenizer,
316
+ self.model.is_multilingual,
317
+ task=task,
318
+ language=language,
319
+ )
320
+
321
+ options = TranscriptionOptions(
322
+ beam_size=beam_size,
323
+ best_of=best_of,
324
+ patience=patience,
325
+ length_penalty=length_penalty,
326
+ repetition_penalty=repetition_penalty,
327
+ log_prob_threshold=log_prob_threshold,
328
+ no_speech_threshold=no_speech_threshold,
329
+ compression_ratio_threshold=compression_ratio_threshold,
330
+ condition_on_previous_text=condition_on_previous_text,
331
+ prompt_reset_on_temperature=prompt_reset_on_temperature,
332
+ temperatures=(
333
+ temperature if isinstance(temperature, (list, tuple)) else [temperature]
334
+ ),
335
+ initial_prompt=initial_prompt,
336
+ prefix=prefix,
337
+ suppress_blank=suppress_blank,
338
+ suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens),
339
+ without_timestamps=without_timestamps,
340
+ max_initial_timestamp=max_initial_timestamp,
341
+ word_timestamps=word_timestamps,
342
+ prepend_punctuations=prepend_punctuations,
343
+ append_punctuations=append_punctuations,
344
+ )
345
+
346
+ segments = self.generate_segments(features, tokenizer, options, encoder_output)
347
+
348
+ if speech_chunks:
349
+ segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate)
350
+
351
+ info = TranscriptionInfo(
352
+ language=language,
353
+ language_probability=language_probability,
354
+ duration=duration,
355
+ transcription_options=options,
356
+ vad_options=vad_parameters,
357
+ all_language_probs=all_language_probs,
358
+ )
359
+
360
+ return segments, info
361
+
362
+ def generate_segments(
363
+ self,
364
+ features: np.ndarray,
365
+ tokenizer: Tokenizer,
366
+ options: TranscriptionOptions,
367
+ encoder_output: Optional[ctranslate2.StorageView] = None,
368
+ ) -> Iterable[Segment]:
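+ # Slide a window of nb_max_frames (30 seconds of audio for Whisper) over the
+ # features, decode each window with generate_with_fallback, and split the decoded
+ # tokens into segments at consecutive timestamp tokens.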
369
+ content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
370
+ idx = 0
371
+ seek = 0
372
+ all_tokens = []
373
+ prompt_reset_since = 0
374
+
375
+ if options.initial_prompt is not None:
376
+ if isinstance(options.initial_prompt, str):
377
+ initial_prompt = " " + options.initial_prompt.strip()
378
+ initial_prompt_tokens = tokenizer.encode(initial_prompt)
379
+ all_tokens.extend(initial_prompt_tokens)
380
+ else:
381
+ all_tokens.extend(options.initial_prompt)
382
+
383
+ last_speech_timestamp = 0.0
384
+ while seek < content_frames:
385
+ time_offset = seek * self.feature_extractor.time_per_frame
386
+ segment = features[:, seek : seek + self.feature_extractor.nb_max_frames]
387
+ segment_size = min(
388
+ self.feature_extractor.nb_max_frames, content_frames - seek
389
+ )
390
+ segment_duration = segment_size * self.feature_extractor.time_per_frame
391
+
392
+ if self.logger.isEnabledFor(logging.DEBUG):
393
+ self.logger.debug(
394
+ "Processing segment at %s", format_timestamp(time_offset)
395
+ )
396
+
397
+ previous_tokens = all_tokens[prompt_reset_since:]
398
+ prompt = self.get_prompt(
399
+ tokenizer,
400
+ previous_tokens,
401
+ without_timestamps=options.without_timestamps,
402
+ prefix=options.prefix if seek == 0 else None,
403
+ )
404
+
405
+ if encoder_output is None:
406
+ encoder_output = self.encode(segment)
407
+
408
+ (
409
+ result,
410
+ avg_logprob,
411
+ temperature,
412
+ compression_ratio,
413
+ ) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options)
414
+
415
+ if options.no_speech_threshold is not None:
416
+ # No-speech check: skip this window if no_speech_prob exceeds the threshold.
417
+ should_skip = result.no_speech_prob > options.no_speech_threshold
418
+
419
+ if (
420
+ options.log_prob_threshold is not None
421
+ and avg_logprob > options.log_prob_threshold
422
+ ):
423
+ # don't skip if the logprob is high enough, despite the no_speech_prob
424
+ should_skip = False
425
+
426
+ if should_skip:
427
+ self.logger.debug(
428
+ "No speech threshold is met (%f > %f)",
429
+ result.no_speech_prob,
430
+ options.no_speech_threshold,
431
+ )
432
+
433
+ # fast-forward to the next segment boundary
434
+ seek += segment_size
435
+ encoder_output = None
436
+ continue
437
+
438
+ tokens = result.sequences_ids[0]
439
+
440
+ previous_seek = seek
441
+ current_segments = []
442
+
443
+ single_timestamp_ending = (
444
+ len(tokens) >= 2
445
+ and tokens[-2] < tokenizer.timestamp_begin
446
+ and tokens[-1] >= tokenizer.timestamp_begin
447
+ )
448
+
449
+ consecutive_timestamps = [
450
+ i
451
+ for i in range(len(tokens))
452
+ if i > 0
453
+ and tokens[i] >= tokenizer.timestamp_begin
454
+ and tokens[i - 1] >= tokenizer.timestamp_begin
455
+ ]
456
+
457
+ if len(consecutive_timestamps) > 0:
458
+ slices = list(consecutive_timestamps)
459
+ if single_timestamp_ending:
460
+ slices.append(len(tokens))
461
+
462
+ last_slice = 0
463
+ for current_slice in slices:
464
+ sliced_tokens = tokens[last_slice:current_slice]
465
+ start_timestamp_position = (
466
+ sliced_tokens[0] - tokenizer.timestamp_begin
467
+ )
468
+ end_timestamp_position = (
469
+ sliced_tokens[-1] - tokenizer.timestamp_begin
470
+ )
471
+ start_time = (
472
+ time_offset + start_timestamp_position * self.time_precision
473
+ )
474
+ end_time = (
475
+ time_offset + end_timestamp_position * self.time_precision
476
+ )
477
+
478
+ current_segments.append(
479
+ dict(
480
+ seek=seek,
481
+ start=start_time,
482
+ end=end_time,
483
+ tokens=sliced_tokens,
484
+ )
485
+ )
486
+ last_slice = current_slice
487
+
488
+ if single_timestamp_ending:
489
+ # single timestamp at the end means no speech after the last timestamp.
490
+ seek += segment_size
491
+ else:
492
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
493
+ last_timestamp_position = (
494
+ tokens[last_slice - 1] - tokenizer.timestamp_begin
495
+ )
496
+ seek += last_timestamp_position * self.input_stride
497
+
498
+ else:
499
+ duration = segment_duration
500
+ timestamps = [
501
+ token for token in tokens if token >= tokenizer.timestamp_begin
502
+ ]
503
+ if len(timestamps) > 0 and timestamps[-1] != tokenizer.timestamp_begin:
504
+ last_timestamp_position = timestamps[-1] - tokenizer.timestamp_begin
505
+ duration = last_timestamp_position * self.time_precision
506
+
507
+ current_segments.append(
508
+ dict(
509
+ seek=seek,
510
+ start=time_offset,
511
+ end=time_offset + duration,
512
+ tokens=tokens,
513
+ )
514
+ )
515
+
516
+ seek += segment_size
517
+
518
+ if options.word_timestamps:
519
+ self.add_word_timestamps(
520
+ current_segments,
521
+ tokenizer,
522
+ encoder_output,
523
+ segment_size,
524
+ options.prepend_punctuations,
525
+ options.append_punctuations,
526
+ last_speech_timestamp=last_speech_timestamp,
527
+ )
528
+
529
+ word_end_timestamps = [
530
+ w["end"] for s in current_segments for w in s["words"]
531
+ ]
532
+ if len(word_end_timestamps) > 0:
533
+ last_speech_timestamp = word_end_timestamps[-1]
534
+ if not single_timestamp_ending and len(word_end_timestamps) > 0:
535
+ seek_shift = round(
536
+ (word_end_timestamps[-1] - time_offset) * self.frames_per_second
537
+ )
538
+
539
+ if seek_shift > 0:
540
+ seek = previous_seek + seek_shift
541
+
542
+ encoder_output = None
543
+
544
+ for segment in current_segments:
545
+ tokens = segment["tokens"]
546
+ text = tokenizer.decode(tokens)
547
+
548
+ if segment["start"] == segment["end"] or not text.strip():
549
+ continue
550
+
551
+ all_tokens.extend(tokens)
552
+ idx += 1
553
+
554
+ yield Segment(
555
+ id=idx,
556
+ seek=seek,
557
+ start=segment["start"],
558
+ end=segment["end"],
559
+ text=text,
560
+ tokens=tokens,
561
+ temperature=temperature,
562
+ avg_logprob=avg_logprob,
563
+ compression_ratio=compression_ratio,
564
+ no_speech_prob=result.no_speech_prob,
565
+ words=(
566
+ [Word(**word) for word in segment["words"]]
567
+ if options.word_timestamps
568
+ else None
569
+ ),
570
+ )
571
+
572
+ if (
573
+ not options.condition_on_previous_text
574
+ or temperature > options.prompt_reset_on_temperature
575
+ ):
576
+ if options.condition_on_previous_text:
577
+ self.logger.debug(
578
+ "Reset prompt. prompt_reset_on_temperature threshold is met %f > %f",
579
+ temperature,
580
+ options.prompt_reset_on_temperature,
581
+ )
582
+
583
+ prompt_reset_since = len(all_tokens)
584
+
585
+ def encode(self, features: np.ndarray) -> ctranslate2.StorageView:
586
+ # When the model is running on multiple GPUs, the encoder output should be moved
587
+ # to the CPU since we don't know which GPU will handle the next job.
588
+ to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
589
+
590
+ features = np.expand_dims(features, 0)
591
+ features = get_ctranslate2_storage(features)
592
+
593
+ return self.model.encode(features, to_cpu=to_cpu)
594
+
595
+ def generate_with_fallback(
596
+ self,
597
+ encoder_output: ctranslate2.StorageView,
598
+ prompt: List[int],
599
+ tokenizer: Tokenizer,
600
+ options: TranscriptionOptions,
601
+ ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]:
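+ # Decode at increasing temperatures, falling back whenever the compression-ratio or
+ # average log-probability checks flag the hypothesis as a likely failure; a high
+ # no_speech_prob short-circuits the fallback since the window is treated as silence.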
602
+ decode_result = None
603
+ all_results = []
604
+ below_cr_threshold_results = []
605
+
606
+ max_initial_timestamp_index = int(
607
+ round(options.max_initial_timestamp / self.time_precision)
608
+ )
609
+
610
+ for temperature in options.temperatures:
611
+ if temperature > 0:
612
+ kwargs = {
613
+ "beam_size": 1,
614
+ "num_hypotheses": options.best_of,
615
+ "sampling_topk": 0,
616
+ "sampling_temperature": temperature,
617
+ }
618
+ else:
619
+ kwargs = {
620
+ "beam_size": options.beam_size,
621
+ "patience": options.patience,
622
+ }
623
+
624
+ result = self.model.generate(
625
+ encoder_output,
626
+ [prompt],
627
+ length_penalty=options.length_penalty,
628
+ repetition_penalty=options.repetition_penalty,
629
+ max_length=self.max_length,
630
+ return_scores=True,
631
+ return_no_speech_prob=True,
632
+ suppress_blank=options.suppress_blank,
633
+ suppress_tokens=options.suppress_tokens,
634
+ max_initial_timestamp_index=max_initial_timestamp_index,
635
+ **kwargs,
636
+ )[0]
637
+
638
+ tokens = result.sequences_ids[0]
639
+
640
+ # Recover the average log prob from the returned score.
641
+ seq_len = len(tokens)
642
+ cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
643
+ avg_logprob = cum_logprob / (seq_len + 1)
644
+
645
+ text = tokenizer.decode(tokens).strip()
646
+ compression_ratio = get_compression_ratio(text)
647
+
648
+ decode_result = (
649
+ result,
650
+ avg_logprob,
651
+ temperature,
652
+ compression_ratio,
653
+ )
654
+ all_results.append(decode_result)
655
+
656
+ needs_fallback = False
657
+
658
+ if options.compression_ratio_threshold is not None:
659
+ if compression_ratio > options.compression_ratio_threshold:
660
+ needs_fallback = True # too repetitive
661
+
662
+ self.logger.debug(
663
+ "Compression ratio threshold is not met with temperature %.1f (%f > %f)",
664
+ temperature,
665
+ compression_ratio,
666
+ options.compression_ratio_threshold,
667
+ )
668
+ else:
669
+ below_cr_threshold_results.append(decode_result)
670
+
671
+ if (
672
+ options.log_prob_threshold is not None
673
+ and avg_logprob < options.log_prob_threshold
674
+ ):
675
+ needs_fallback = True # average log probability is too low
676
+
677
+ self.logger.debug(
678
+ "Log probability threshold is not met with temperature %.1f (%f < %f)",
679
+ temperature,
680
+ avg_logprob,
681
+ options.log_prob_threshold,
682
+ )
683
+
684
+ if (
685
+ options.no_speech_threshold is not None
686
+ and result.no_speech_prob > options.no_speech_threshold
687
+ ):
688
+ needs_fallback = False # silence
689
+
690
+ if not needs_fallback:
691
+ break
692
+ else:
693
+ # all failed, select the result with the highest average log probability
694
+ decode_result = max(
695
+ below_cr_threshold_results or all_results, key=lambda x: x[1]
696
+ )
697
+
698
+ return decode_result
699
+
700
+ def get_prompt(
701
+ self,
702
+ tokenizer: Tokenizer,
703
+ previous_tokens: List[int],
704
+ without_timestamps: bool = False,
705
+ prefix: Optional[str] = None,
706
+ ) -> List[int]:
707
+ prompt = []
708
+
709
+ if previous_tokens:
710
+ prompt.append(tokenizer.sot_prev)
711
+ prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
712
+
713
+ prompt.extend(tokenizer.sot_sequence)
714
+
715
+ if without_timestamps:
716
+ prompt.append(tokenizer.no_timestamps)
717
+
718
+ if prefix:
719
+ prefix_tokens = tokenizer.encode(" " + prefix.strip())
720
+ if len(prefix_tokens) >= self.max_length // 2:
721
+ prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
722
+ if not without_timestamps:
723
+ prompt.append(tokenizer.timestamp_begin)
724
+ prompt.extend(prefix_tokens)
725
+
726
+ return prompt
727
+
728
+ def add_word_timestamps(
729
+ self,
730
+ segments: List[dict],
731
+ tokenizer: Tokenizer,
732
+ encoder_output: ctranslate2.StorageView,
733
+ num_frames: int,
734
+ prepend_punctuations: str,
735
+ append_punctuations: str,
736
+ last_speech_timestamp: float,
737
+ ):
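+ # Align each segment's text tokens against the encoder output to obtain word-level
+ # timings, then clamp implausibly long words at sentence and segment boundaries
+ # (see the "hack" notes below).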
738
+ if len(segments) == 0:
739
+ return
740
+
741
+ text_tokens_per_segment = [
742
+ [token for token in segment["tokens"] if token < tokenizer.eot]
743
+ for segment in segments
744
+ ]
745
+
746
+ text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment))
747
+ alignment = self.find_alignment(
748
+ tokenizer, text_tokens, encoder_output, num_frames
749
+ )
750
+ word_durations = np.array([word["end"] - word["start"] for word in alignment])
751
+ word_durations = word_durations[word_durations.nonzero()]
752
+ median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
753
+ max_duration = median_duration * 2
754
+
755
+ # hack: truncate long words at sentence boundaries.
756
+ # a better segmentation algorithm based on VAD should be able to replace this.
757
+ if len(word_durations) > 0:
758
+ sentence_end_marks = ".。!!??"
759
+ # ensure words at sentence boundaries
760
+ # are not longer than twice the median word duration.
761
+ for i in range(1, len(alignment)):
762
+ if alignment[i]["end"] - alignment[i]["start"] > max_duration:
763
+ if alignment[i]["word"] in sentence_end_marks:
764
+ alignment[i]["end"] = alignment[i]["start"] + max_duration
765
+ elif alignment[i - 1]["word"] in sentence_end_marks:
766
+ alignment[i]["start"] = alignment[i]["end"] - max_duration
767
+
768
+ merge_punctuations(alignment, prepend_punctuations, append_punctuations)
769
+
770
+ time_offset = (
771
+ segments[0]["seek"]
772
+ * self.feature_extractor.hop_length
773
+ / self.feature_extractor.sampling_rate
774
+ )
775
+
776
+ word_index = 0
777
+
778
+ for segment, text_tokens in zip(segments, text_tokens_per_segment):
779
+ saved_tokens = 0
780
+ words = []
781
+
782
+ while word_index < len(alignment) and saved_tokens < len(text_tokens):
783
+ timing = alignment[word_index]
784
+
785
+ if timing["word"]:
786
+ words.append(
787
+ dict(
788
+ word=timing["word"],
789
+ start=round(time_offset + timing["start"], 2),
790
+ end=round(time_offset + timing["end"], 2),
791
+ probability=timing["probability"],
792
+ )
793
+ )
794
+
795
+ saved_tokens += len(timing["tokens"])
796
+ word_index += 1
797
+
798
+ # hack: truncate long words at segment boundaries.
799
+ # a better segmentation algorithm based on VAD should be able to replace this.
800
+ if len(words) > 0:
801
+ # ensure the first and second word after a pause is not longer than
802
+ # twice the median word duration.
803
+ if words[0]["end"] - last_speech_timestamp > median_duration * 4 and (
804
+ words[0]["end"] - words[0]["start"] > max_duration
805
+ or (
806
+ len(words) > 1
807
+ and words[1]["end"] - words[0]["start"] > max_duration * 2
808
+ )
809
+ ):
810
+ if (
811
+ len(words) > 1
812
+ and words[1]["end"] - words[1]["start"] > max_duration
813
+ ):
814
+ boundary = max(
815
+ words[1]["end"] / 2, words[1]["end"] - max_duration
816
+ )
817
+ words[0]["end"] = words[1]["start"] = boundary
818
+ words[0]["start"] = max(0, words[0]["end"] - max_duration)
819
+
820
+ # prefer the segment-level start timestamp if the first word is too long.
821
+ if (
822
+ segment["start"] < words[0]["end"]
823
+ and segment["start"] - 0.5 > words[0]["start"]
824
+ ):
825
+ words[0]["start"] = max(
826
+ 0, min(words[0]["end"] - median_duration, segment["start"])
827
+ )
828
+ else:
829
+ segment["start"] = words[0]["start"]
830
+
831
+ # prefer the segment-level end timestamp if the last word is too long.
832
+ if (
833
+ segment["end"] > words[-1]["start"]
834
+ and segment["end"] + 0.5 < words[-1]["end"]
835
+ ):
836
+ words[-1]["end"] = max(
837
+ words[-1]["start"] + median_duration, segment["end"]
838
+ )
839
+ else:
840
+ segment["end"] = words[-1]["end"]
841
+
842
+ last_speech_timestamp = segment["end"]
843
+
844
+ segment["words"] = words
845
+
846
+ def find_alignment(
847
+ self,
848
+ tokenizer: Tokenizer,
849
+ text_tokens: List[int],
850
+ encoder_output: ctranslate2.StorageView,
851
+ num_frames: int,
852
+ median_filter_width: int = 7,
853
+ ) -> List[dict]:
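+ # Use the model's cross-attention alignment (ctranslate2's align) to map text tokens
+ # to audio frames, then derive per-word start/end times and a mean token probability
+ # for each word.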
854
+ if len(text_tokens) == 0:
855
+ return []
856
+
857
+ result = self.model.align(
858
+ encoder_output,
859
+ tokenizer.sot_sequence,
860
+ [text_tokens],
861
+ num_frames,
862
+ median_filter_width=median_filter_width,
863
+ )[0]
864
+
865
+ text_token_probs = result.text_token_probs
866
+
867
+ alignments = result.alignments
868
+ text_indices = np.array([pair[0] for pair in alignments])
869
+ time_indices = np.array([pair[1] for pair in alignments])
870
+
871
+ words, word_tokens = tokenizer.split_to_word_tokens(
872
+ text_tokens + [tokenizer.eot]
873
+ )
874
+ word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
875
+ if len(word_boundaries) <= 1:
876
+ return []
877
+
878
+ jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
879
+ jump_times = time_indices[jumps] / self.tokens_per_second
880
+ start_times = jump_times[word_boundaries[:-1]]
881
+ end_times = jump_times[word_boundaries[1:]]
882
+ word_probabilities = [
883
+ np.mean(text_token_probs[i:j])
884
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
885
+ ]
886
+
887
+ return [
888
+ dict(
889
+ word=word, tokens=tokens, start=start, end=end, probability=probability
890
+ )
891
+ for word, tokens, start, end, probability in zip(
892
+ words, word_tokens, start_times, end_times, word_probabilities
893
+ )
894
+ ]
895
+
896
+
897
+ def restore_speech_timestamps(
898
+ segments: Iterable[Segment],
899
+ speech_chunks: List[dict],
900
+ sampling_rate: int,
901
+ ) -> Iterable[Segment]:
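+ # Map timestamps measured on the VAD-trimmed audio back onto the original timeline
+ # using the recorded speech chunks.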
902
+ ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
903
+
904
+ for segment in segments:
905
+ if segment.words:
906
+ words = []
907
+ for word in segment.words:
908
+ # Ensure the word start and end times are resolved to the same chunk.
909
+ middle = (word.start + word.end) / 2
910
+ chunk_index = ts_map.get_chunk_index(middle)
911
+ word = word._replace(
912
+ start=ts_map.get_original_time(word.start, chunk_index),
913
+ end=ts_map.get_original_time(word.end, chunk_index),
914
+ )
915
+ words.append(word)
916
+
917
+ segment = segment._replace(
918
+ start=words[0].start,
919
+ end=words[-1].end,
920
+ words=words,
921
+ )
922
+
923
+ else:
924
+ segment = segment._replace(
925
+ start=ts_map.get_original_time(segment.start),
926
+ end=ts_map.get_original_time(segment.end),
927
+ )
928
+
929
+ yield segment
930
+
931
+
932
+ def get_ctranslate2_storage(segment: np.ndarray) -> ctranslate2.StorageView:
933
+ segment = np.ascontiguousarray(segment)
934
+ segment = ctranslate2.StorageView.from_array(segment)
935
+ return segment
936
+
937
+
938
+ def get_compression_ratio(text: str) -> float:
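+ # Repetitive text compresses well under zlib, so a high ratio signals a likely
+ # decoding failure (compared against compression_ratio_threshold during fallback).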
939
+ text_bytes = text.encode("utf-8")
940
+ return len(text_bytes) / len(zlib.compress(text_bytes))
941
+
942
+
943
+ def get_suppressed_tokens(tokenizer, suppress_tokens):
944
+ if not suppress_tokens or -1 in suppress_tokens:
945
+ return suppress_tokens
946
+
947
+ suppress_tokens = list(suppress_tokens)
948
+
949
+ # Ensure the following special tokens are suppressed when the user does
950
+ # not use the default set (-1).
951
+ suppress_tokens.extend(
952
+ [
953
+ tokenizer.transcribe,
954
+ tokenizer.translate,
955
+ tokenizer.sot,
956
+ tokenizer.sot_prev,
957
+ tokenizer.sot_lm,
958
+ ]
959
+ )
960
+
961
+ return sorted(set(suppress_tokens))
962
+
963
+
964
+ def merge_punctuations(alignment: List[dict], prepended: str, appended: str):
965
+ # merge prepended punctuations
966
+ i = len(alignment) - 2
967
+ j = len(alignment) - 1
968
+ while i >= 0:
969
+ previous = alignment[i]
970
+ following = alignment[j]
971
+ if previous["word"].startswith(" ") and previous["word"].strip() in prepended:
972
+ # prepend it to the following word
973
+ following["word"] = previous["word"] + following["word"]
974
+ following["tokens"] = previous["tokens"] + following["tokens"]
975
+ previous["word"] = ""
976
+ previous["tokens"] = []
977
+ else:
978
+ j = i
979
+ i -= 1
980
+
981
+ # merge appended punctuations
982
+ i = 0
983
+ j = 1
984
+ while j < len(alignment):
985
+ previous = alignment[i]
986
+ following = alignment[j]
987
+ if not previous["word"].endswith(" ") and following["word"] in appended:
988
+ # append it to the previous word
989
+ previous["word"] = previous["word"] + following["word"]
990
+ previous["tokens"] = previous["tokens"] + following["tokens"]
991
+ following["word"] = ""
992
+ following["tokens"] = []
993
+ else:
994
+ i = j
995
+ j += 1