{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "iyLoWDsb9rEs" }, "outputs": [], "source": [ "# unzip the audio files from commom voice dataset with Turkish language and Portuguese language\n", "#! tar -xf data/cv-corpus-15.0-2023-09-08-pt.tar.gz\n", "#! tar -xf data/cv-corpus-15.0-2023-09-08-tr.tar.gz" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/User/en_tr_pt_titanet_large\n" ] } ], "source": [ "# Convert the mp3 files to wav files with 16kHz sampling rate and 16 bits, 1 channel\n", "import os\n", "NEMO_ROOT = os.getcwd()\n", "print(NEMO_ROOT)\n", "import glob\n", "import subprocess\n", "\n", "data_dir = os.path.join(NEMO_ROOT,'data')\n", "#os.makedirs(data_dir, exist_ok=True)\n", "\n", "#print(\"Converting .mp3 to .wav...\")\n", "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/pt/clips/*.mp3', recursive=True)\n", "#for mp3_path in mp3_list:\n", "# wav_path = mp3_path[:-4] + '.wav'\n", "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n", "# subprocess.run(cmd)\n", "#print(\"Finished conversion.\\n******\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#print(\"Converting .mp3 to .wav...\")\n", "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/tr/clips/*.mp3', recursive=True)\n", "#for mp3_path in mp3_list:\n", "# wav_path = mp3_path[:-4] + '.wav'\n", "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n", "# subprocess.run(cmd)\n", "#print(\"Finished conversion.\\n******\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "vqUBayc_Ctcr" }, "outputs": [], "source": [ "# prepare the train, dev, test dataset for Portuguese language\n", "import pandas as pd\n", "import os\n", "\n", "#pt_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/clip_durations.tsv', sep='\\t')\n", "#pt_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/train.tsv', sep='\\t')\n", "#pt_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/dev.tsv', sep='\\t')\n", "#pt_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/test.tsv', sep='\\t')\n", "\n", "#merged_pt_train_df = pd.merge(pt_train_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "#merged_pt_dev_df = pd.merge(pt_dev_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "#merged_pt_test_df = pd.merge(pt_test_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#merged_pt_train_df['audio_filepath'] = merged_pt_train_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "#merged_pt_dev_df['audio_filepath'] = merged_pt_dev_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "#merged_pt_test_df['audio_filepath'] = 
merged_pt_test_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "\n", "#merged_pt_train_df[\"audio_filepath\"] = merged_pt_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "#merged_pt_dev_df[\"audio_filepath\"] = merged_pt_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "#merged_pt_test_df[\"audio_filepath\"] = merged_pt_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "\n", "#merged_pt_train_df['duration'] = merged_pt_train_df['duration'].apply(lambda x: x / 1000)\n", "#merged_pt_dev_df['duration'] = merged_pt_dev_df['duration'].apply(lambda x: x / 1000)\n", "#merged_pt_test_df['duration'] = merged_pt_test_df['duration'].apply(lambda x: x / 1000)\n", "\n", "#merged_pt_train_df = merged_pt_train_df[['audio_filepath', 'duration', 'label']]\n", "#merged_pt_dev_df = merged_pt_dev_df[['audio_filepath', 'duration', 'label']]\n", "#merged_pt_test_df = merged_pt_test_df[['audio_filepath', 'duration', 'label']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audio_filepathdurationlabel
0/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...6.504c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
1/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.656c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
2/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...3.504c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
3/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...3.456c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
4/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.224c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
............
21052/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.860d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21053/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...2.196d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21054/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...2.124d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21055/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...1.908d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21056/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...5.436d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
\n", "

21057 rows × 3 columns

\n", "
" ], "text/plain": [ " audio_filepath duration \\\n", "0 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 6.504 \n", "1 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.656 \n", "2 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.504 \n", "3 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.456 \n", "4 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.224 \n", "... ... ... \n", "21052 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.860 \n", "21053 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.196 \n", "21054 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.124 \n", "21055 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 1.908 \n", "21056 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 5.436 \n", "\n", " label \n", "0 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "1 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "2 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "3 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "4 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "... ... \n", "21052 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21053 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21054 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21055 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21056 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "\n", "[21057 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#merged_pt_train_df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": {}, "colab_type": "code", "id": "vnrUh3vuDSRN" }, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "# prepare the train, dev, test dataset for Turkish language\n", "tr_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/clip_durations.tsv', sep='\\t')\n", "tr_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/train.tsv', sep='\\t')\n", "tr_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/dev.tsv', sep='\\t')\n", "tr_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/test.tsv', sep='\\t')\n", "\n", "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":5: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":6: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":7: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", 
\".wav\")\n" ] } ], "source": [ "\n", "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join('/Users/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join('/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join('/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "\n", "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "\n", "merged_tr_train_df['duration'] = merged_tr_train_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_test_df['duration'] = merged_tr_test_df['duration'].apply(lambda x: x / 1000)\n", "\n", "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "merged_tr_train_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/train.json', orient='records', lines=True)\n", "merged_tr_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/dev.json', orient='records', lines=True)\n", "merged_tr_test_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/test.json', orient='records', lines=True)\n", "\n", "#merged_pt_train_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/train.json', orient='records', lines=True)\n", "#merged_pt_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/dev.json', orient='records', lines=True)\n", "#merged_pt_test_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/test.json', orient='records', lines=True)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: TitaNet-Finetune\n", "sample_rate: 16000\n", "init_from_pretrained_model:\n", " speaker_tasks:\n", " name: titanet_large\n", " include:\n", " - preprocessor\n", " - encoder\n", " exclude:\n", " - decoder.final\n", "model:\n", " train_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 64\n", " shuffle: true\n", " is_tarred: false\n", " tarred_audio_filepaths: null\n", " tarred_shard_strategy: scatter\n", " augmentor:\n", " speed:\n", " prob: 0.3\n", " sr: 16000\n", " resample_type: kaiser_fast\n", " min_speed_rate: 0.95\n", " max_speed_rate: 1.05\n", " validation_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 128\n", " shuffle: false\n", " test_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 1\n", " shuffle: false\n", " embedding_dir: ./embeddings\n", " model_defaults:\n", " filters: 1024\n", " repeat: 3\n", " dropout: 0.1\n", " separable: true\n", " se: true\n", " se_context_size: -1\n", " kernel_size_factor: 1.0\n", " preprocessor:\n", " _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor\n", " normalize: per_feature\n", " window_size: 
0.025\n", " sample_rate: 16000\n", " window_stride: 0.01\n", " window: hann\n", " features: 80\n", " n_fft: 512\n", " frame_splicing: 1\n", " dither: 1.0e-05\n", " encoder:\n", " _target_: nemo.collections.asr.modules.ConvASREncoder\n", " feat_in: 80\n", " activation: relu\n", " conv_mask: true\n", " jasper:\n", " - filters: ${model.model_defaults.filters}\n", " repeat: 1\n", " kernel:\n", " - 3\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: 0.0\n", " residual: false\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 7\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 11\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 15\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: 3072\n", " repeat: 1\n", " kernel:\n", " - 1\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: 0.0\n", " residual: false\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " decoder:\n", " _target_: nemo.collections.asr.modules.SpeakerDecoder\n", " feat_in: 3072\n", " num_classes: ???\n", " pool_mode: attention\n", " emb_sizes: 192\n", " loss:\n", " _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss\n", " scale: 30\n", " margin: 0.2\n", " optim_param_groups:\n", " encoder:\n", " lr: 0.001\n", " optim:\n", " name: adamw\n", " lr: 0.0001\n", " weight_decay: 0.0002\n", " sched:\n", " name: CosineAnnealing\n", " warmup_ratio: 0.1\n", " min_lr: 0.0\n", "trainer:\n", " devices: 1\n", " max_epochs: 10\n", " max_steps: -1\n", " num_nodes: 1\n", " accelerator: gpu\n", " strategy: ddp\n", " deterministic: true\n", " enable_checkpointing: false\n", " logger: false\n", " log_every_n_steps: 1\n", " val_check_interval: 1.0\n", " gradient_clip_val: 1.0\n", "exp_manager:\n", " exp_dir: null\n", " name: TitaNet-Finetune\n", " create_tensorboard_logger: true\n", " create_checkpoint_callback: true\n", "\n" ] } ], "source": [ "# Set up the config for fine-tuning\n", "from omegaconf import OmegaConf\n", "finetune_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "print(OmegaConf.to_yaml(finetune_config))\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Fine-tune the model with Portuguese language\n", "\n", "import torch\n", "import pytorch_lightning as pl\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from 
omegaconf import OmegaConf\n", "from nemo.utils.exp_manager import exp_manager\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pt_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "## set up the trainer\n", "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "\n", "pt_trainer_config = OmegaConf.create(dict(\n", " devices=4,\n", " accelerator=accelerator,\n", " max_epochs=5,\n", " max_steps=-1, # computed at runtime if not set\n", " num_nodes=1,\n", " accumulate_grad_batches=1,\n", " enable_checkpointing=False, # Provided by exp_manager\n", " logger=False, # Provided by exp_manager\n", " log_every_n_steps=1, # Interval of logging.\n", " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", "))\n", "print(OmegaConf.to_yaml(pt_trainer_config))\n", "pt_trainer_finetune = pl.Trainer(**pt_trainer_config)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#set up the nemo experiment for logging and monitoring purpose\n", "log_dir_finetune = exp_manager(trainer=pt_trainer_finetune, config=pt_config, name='titanet_finetune_pt').get_save_dir()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# set up the manifest file for Portuguese language\n", "pt_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/train.json'\n", "pt_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/dev.json'\n", "pt_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/test.json'\n", "pt_config.model.decoder.num_classes = merged_pt_train_df['label'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up the model for Portuguese language and train the model\n", "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=pt_config.model, trainer=trainer_finetune)\n", "speaker_model.maybe_init_from_pretrained_checkpoint(pt_config)\n", "\n", "pt_trainer_finetune.fit(speaker_model)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the model after fine-tuning with Portuguese language\n", "speaker_model.save_to('titanet_finetune_pt.nemo')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "devices: 1\n", "accelerator: cpu\n", "max_epochs: 5\n", "max_steps: -1\n", "num_nodes: 1\n", "accumulate_grad_batches: 1\n", "enable_checkpointing: false\n", "logger: false\n", "log_every_n_steps: 1\n", "val_check_interval: 1.0\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "GPU available: False, used: False\n", "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:15:08 exp_manager:381] Experiments will be logged at /User/en_tr_pt_titanet_large/nemo_experiments/TitaNet-Finetune/2023-09-25_04-36-46\n", "[NeMo I 2023-09-25 05:15:08 exp_manager:815] TensorboardLogger has been set up\n", "[NeMo I 2023-09-25 05:15:08 exp_manager:930] Preemption is supported only on GPUs, disabling preemption\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading 
collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:31 label_models:187] Total number of 24 found in all the manifest files.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10502 items, total duration of 10.23 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 10502 files loaded accounting to # 128 labels\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10880 items, total duration of 12.25 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 10880 files loaded accounting to # 1244 labels\n", "[NeMo I 2023-09-25 05:31:31 features:289] PADDING: 16\n", "[NeMo I 2023-09-25 05:31:32 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n", "[NeMo I 2023-09-25 05:31:38 common:913] Instantiating model from pre-trained checkpoint\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", " Train config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 64\n", " shuffle: true\n", " is_tarred: false\n", " tarred_audio_filepaths: null\n", " tarred_shard_strategy: scatter\n", " augmentor:\n", " noise:\n", " manifest_path: /manifests/noise/rir_noise_manifest.json\n", " prob: 0.5\n", " min_snr_db: 0\n", " max_snr_db: 15\n", " speed:\n", " prob: 0.5\n", " sr: 16000\n", " resample_type: kaiser_fast\n", " min_speed_rate: 0.95\n", " max_speed_rate: 1.05\n", " num_workers: 15\n", " pin_memory: true\n", " \n", "[NeMo W 2023-09-25 05:31:38 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", " Validation config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 128\n", " shuffle: false\n", " num_workers: 15\n", " pin_memory: true\n", " \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:31:38 features:289] PADDING: 16\n", "[NeMo I 2023-09-25 05:31:39 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1156] Make sure that this is what you wanted!\n", "[NeMo I 2023-09-25 05:31:39 modelPT:735] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.0001\n", " maximize: False\n", " weight_decay: 0.0002\n", " \n", " Parameter Group 1\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.001\n", " maximize: False\n", " weight_decay: 0.0002\n", " )\n", "[NeMo I 2023-09-25 05:31:39 lr_scheduler:910] Scheduler \"\" \n", " will be used during training (effective maximum steps = 2430) - \n", " Parameters : \n", " (warmup_ratio: 0.1\n", " min_lr: 0.0\n", " max_steps: 2430\n", " )\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " | Name | Type | Params\n", "----------------------------------------------------------------------\n", "0 | loss | AngularSoftmaxLoss | 0 \n", "1 | eval_loss | AngularSoftmaxLoss | 0 \n", "2 | _accuracy | TopKClassificationAccuracy | 0 \n", "3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n", "4 | encoder | ConvASREncoder | 19.4 M\n", "5 | decoder | SpeakerDecoder | 2.8 M \n", "6 | _macro_accuracy | MulticlassAccuracy | 0 \n", "----------------------------------------------------------------------\n", "22.1 M Trainable params\n", "0 Non-trainable params\n", "22.1 M Total params\n", "88.508 Total estimated model params size (MB)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8a6fa6c7b4214098b48c00a8562b8051", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Sanity Checking: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:39 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", " rank_zero_warn(\n", " \n", "[NeMo E 2023-09-25 05:31:39 segment:249] Loading /User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav via SoundFile raised RuntimeError: `Error opening '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav': System error.`. 
NeMo will fallback to loading via pydub.\n" ] }, { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[16], line 45\u001b[0m\n\u001b[1;32m 43\u001b[0m speaker_model \u001b[38;5;241m=\u001b[39m nemo_asr\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mEncDecSpeakerLabelModel(cfg\u001b[38;5;241m=\u001b[39mtr_config\u001b[38;5;241m.\u001b[39mmodel, trainer\u001b[38;5;241m=\u001b[39mtr_trainer_finetune)\n\u001b[1;32m 44\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39mmaybe_init_from_pretrained_checkpoint(tr_config)\n\u001b[0;32m---> 45\u001b[0m \u001b[43mtr_trainer_finetune\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mspeaker_model\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Save the model after fine-tuning with Turkish language\u001b[39;00m\n\u001b[1;32m 49\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39msave_to(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitanet_finetune_tr.nemo\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:532\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39m_lightning_module \u001b[38;5;241m=\u001b[39m model\n\u001b[1;32m 531\u001b[0m _verify_strategy_supports_compile(model, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy)\n\u001b[0;32m--> 532\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:43\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 46\u001b[0m _call_teardown_hook(trainer)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:571\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_connector\u001b[38;5;241m.\u001b[39mattach_data(\n\u001b[1;32m 562\u001b[0m model, train_dataloaders\u001b[38;5;241m=\u001b[39mtrain_dataloaders, val_dataloaders\u001b[38;5;241m=\u001b[39mval_dataloaders, datamodule\u001b[38;5;241m=\u001b[39mdatamodule\n\u001b[1;32m 563\u001b[0m )\n\u001b[1;32m 565\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 567\u001b[0m ckpt_path,\n\u001b[1;32m 568\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 569\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 570\u001b[0m )\n\u001b[0;32m--> 571\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:980\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_signal_connector\u001b[38;5;241m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 977\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 978\u001b[0m \u001b[38;5;66;03m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 980\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_stage\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 982\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 983\u001b[0m \u001b[38;5;66;03m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 984\u001b[0m 
\u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 985\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: trainer tearing down\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1021\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining:\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m isolate_rng():\n\u001b[0;32m-> 1021\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sanity_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1022\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mautograd\u001b[38;5;241m.\u001b[39mset_detect_anomaly(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_detect_anomaly):\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit_loop\u001b[38;5;241m.\u001b[39mrun()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1050\u001b[0m, in \u001b[0;36mTrainer._run_sanity_check\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1047\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_start\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1049\u001b[0m \u001b[38;5;66;03m# run eval step\u001b[39;00m\n\u001b[0;32m-> 1050\u001b[0m \u001b[43mval_loop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1052\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_end\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:181\u001b[0m, in \u001b[0;36m_no_grad_context.._decorator\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 179\u001b[0m context_manager \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mno_grad\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context_manager():\n\u001b[0;32m--> 181\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloop_run\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/evaluation_loop.py:108\u001b[0m, in \u001b[0;36m_EvaluationLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 108\u001b[0m batch, 
batch_idx, dataloader_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata_fetcher\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_progress\u001b[38;5;241m.\u001b[39mis_last_batch \u001b[38;5;241m=\u001b[39m data_fetcher\u001b[38;5;241m.\u001b[39mdone\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m previous_dataloader_idx \u001b[38;5;241m!=\u001b[39m dataloader_idx:\n\u001b[1;32m 111\u001b[0m \u001b[38;5;66;03m# the dataloader has changed, notify the logger connector\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:137\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdone:\n\u001b[1;32m 135\u001b[0m \u001b[38;5;66;03m# this will run only when no pre-fetching was done.\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fetch_next_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataloader_iter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# consume the batch we just fetched\u001b[39;00m\n\u001b[1;32m 139\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatches\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;241m0\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:151\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher._fetch_next_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_start_profiler()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop_profiler()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:285\u001b[0m, in \u001b[0;36mCombinedLoader.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator, _Sequential):\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n", "File 
\u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:123\u001b[0m, in \u001b[0;36m_Sequential.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 123\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterators\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:628\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:671\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 670\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# 
may raise StopIteration\u001b[39;00m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 673\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/data/audio_to_label.py:327\u001b[0m, in \u001b[0;36m_AudioLabelDataset.__getitem__\u001b[0;34m(self, index)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m offset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 325\u001b[0m offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 327\u001b[0m features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeaturizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maudio_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 328\u001b[0m f, fl \u001b[38;5;241m=\u001b[39m features, torch\u001b[38;5;241m.\u001b[39mtensor(features\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mlong()\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_regression_task:\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/features.py:186\u001b[0m, in \u001b[0;36mWaveformFeaturizer.process\u001b[0;34m(self, file_path, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess\u001b[39m(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 174\u001b[0m file_path,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 184\u001b[0m normalize_db\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 185\u001b[0m ):\n\u001b[0;32m--> 186\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[43mAudioSegment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample_rate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mint_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_ref\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_ref\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_top_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_top_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_frame_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_frame_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_hop_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_hop_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43morig_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morig_sr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43mchannel_selector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchannel_selector\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mnormalize_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnormalize_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocess_segment(audio)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/segment.py:259\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, audio_file, target_sr, int_values, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m HAVE_PYDUB \u001b[38;5;129;01mand\u001b[39;00m samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 259\u001b[0m samples \u001b[38;5;241m=\u001b[39m \u001b[43mAudio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m sample_rate \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mframe_rate\n\u001b[1;32m 261\u001b[0m num_channels \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mchannels\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/audio_segment.py:651\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, file, format, codec, parameters, start_second, duration, **kwargs)\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 650\u001b[0m filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 651\u001b[0m file, close_file \u001b[38;5;241m=\u001b[39m \u001b[43m_fd_or_path_or_tempfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtempfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mformat\u001b[39m:\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28mformat\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mlower()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/utils.py:60\u001b[0m, in \u001b[0;36m_fd_or_path_or_tempfile\u001b[0;34m(fd, mode, tempfile)\u001b[0m\n\u001b[1;32m 57\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(fd, basestring):\n\u001b[0;32m---> 60\u001b[0m fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 
'/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'" ] } ], "source": [ "# Imports for fine-tuning (repeated so this cell can run standalone)\n", "\n", "import torch\n", "import pytorch_lightning as pl\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from omegaconf import OmegaConf\n", "from nemo.utils.exp_manager import exp_manager\n", "\n", "# Fine-tune the model with Turkish language\n", "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "## Set up the trainer (fall back to CPU when no GPU is available)\n", "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "\n", "tr_trainer_config = OmegaConf.create(dict(\n", " devices=4,\n", " accelerator=accelerator,\n", " max_epochs=5,\n", " max_steps=-1, # computed at runtime if not set\n", " num_nodes=1,\n", " accumulate_grad_batches=1,\n", " enable_checkpointing=False, # Provided by exp_manager\n", " logger=False, # Provided by exp_manager\n", " log_every_n_steps=1, # Interval of logging.\n", " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", "))\n", "print(OmegaConf.to_yaml(tr_trainer_config))\n", "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n", "\n", "\n", "# Set up the NeMo experiment manager for logging and monitoring\n", "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n", "\n", "\n", "# Point the config at the Turkish manifests\n", "tr_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/train.json'\n", "tr_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/dev.json'\n", "tr_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/test.json'\n", "# The decoder output size must match the number of speakers in the training split\n", "tr_config.model.decoder.num_classes = merged_tr_train_df['label'].nunique()\n", "\n", "\n", "# Set up the model for Turkish and train it\n", "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n", "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n", "tr_trainer_finetune.fit(speaker_model)\n", "\n", "# Save the model after fine-tuning with Turkish\n", "\n", "speaker_model.save_to('titanet_finetune_tr.nemo')" ] },
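{ "cell_type": "markdown", "metadata": {}, "source": [ "The `fit` call above stops with a `FileNotFoundError`: the Turkish manifests point at `.wav` files, but the sox conversion cell near the top of the notebook is commented out, so those files were never created. One way to guard against this, sketched under the assumption that the manifests above have already been written, is to drop entries whose audio file is missing before handing the manifests to the trainer. The `filter_missing_audio` helper and the `*_filtered.json` file names are hypothetical; the filtered paths would then be assigned to `tr_config.model.*_ds.manifest_filepath`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "def filter_missing_audio(manifest_path, filtered_path):\n", "    \"\"\"Copy a NeMo manifest, keeping only entries whose wav file exists on disk.\"\"\"\n", "    kept, dropped = 0, 0\n", "    with open(manifest_path) as src, open(filtered_path, 'w') as dst:\n", "        for line in src:\n", "            entry = json.loads(line)\n", "            if os.path.isfile(entry['audio_filepath']):\n", "                dst.write(line)\n", "                kept += 1\n", "            else:\n", "                dropped += 1\n", "    print(f'{manifest_path}: kept {kept} entries, dropped {dropped} with missing audio')\n", "\n", "for split in ('train', 'dev', 'test'):\n", "    filter_missing_audio(f'data/cv-corpus-15.0-2023-09-08/tr/{split}.json',\n", "                         f'data/cv-corpus-15.0-2023-09-08/tr/{split}_filtered.json')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "Speaker_Recogniton_Verification.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "transcribe", "language": "python", "name": "conda-env-.conda-transcribe-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 4 }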