{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "iyLoWDsb9rEs" }, "outputs": [], "source": [ "# unzip the audio files from commom voice dataset with Turkish language and Portuguese language\n", "#! tar -xf data/cv-corpus-15.0-2023-09-08-pt.tar.gz\n", "#! tar -xf data/cv-corpus-15.0-2023-09-08-tr.tar.gz" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/User/en_tr_pt_titanet_large\n" ] } ], "source": [ "# Convert the mp3 files to wav files with 16kHz sampling rate and 16 bits, 1 channel\n", "import os\n", "NEMO_ROOT = os.getcwd()\n", "print(NEMO_ROOT)\n", "import glob\n", "import subprocess\n", "\n", "data_dir = os.path.join(NEMO_ROOT,'data')\n", "#os.makedirs(data_dir, exist_ok=True)\n", "\n", "#print(\"Converting .mp3 to .wav...\")\n", "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/pt/clips/*.mp3', recursive=True)\n", "#for mp3_path in mp3_list:\n", "# wav_path = mp3_path[:-4] + '.wav'\n", "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n", "# subprocess.run(cmd)\n", "#print(\"Finished conversion.\\n******\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#print(\"Converting .mp3 to .wav...\")\n", "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/tr/clips/*.mp3', recursive=True)\n", "#for mp3_path in mp3_list:\n", "# wav_path = mp3_path[:-4] + '.wav'\n", "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n", "# subprocess.run(cmd)\n", "#print(\"Finished conversion.\\n******\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "vqUBayc_Ctcr" }, "outputs": [], "source": [ "# prepare the train, dev, test dataset for Portuguese language\n", "import pandas as pd\n", "import os\n", "\n", "#pt_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/clip_durations.tsv', sep='\\t')\n", "#pt_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/train.tsv', sep='\\t')\n", "#pt_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/dev.tsv', sep='\\t')\n", "#pt_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/test.tsv', sep='\\t')\n", "\n", "#merged_pt_train_df = pd.merge(pt_train_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "#merged_pt_dev_df = pd.merge(pt_dev_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "#merged_pt_test_df = pd.merge(pt_test_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#merged_pt_train_df['audio_filepath'] = merged_pt_train_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "#merged_pt_dev_df['audio_filepath'] = merged_pt_dev_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "#merged_pt_test_df['audio_filepath'] = 
merged_pt_test_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n", "\n", "#merged_pt_train_df[\"audio_filepath\"] = merged_pt_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "#merged_pt_dev_df[\"audio_filepath\"] = merged_pt_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "#merged_pt_test_df[\"audio_filepath\"] = merged_pt_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "\n", "#merged_pt_train_df['duration'] = merged_pt_train_df['duration'].apply(lambda x: x / 1000)\n", "#merged_pt_dev_df['duration'] = merged_pt_dev_df['duration'].apply(lambda x: x / 1000)\n", "#merged_pt_test_df['duration'] = merged_pt_test_df['duration'].apply(lambda x: x / 1000)\n", "\n", "#merged_pt_train_df = merged_pt_train_df[['audio_filepath', 'duration', 'label']]\n", "#merged_pt_dev_df = merged_pt_dev_df[['audio_filepath', 'duration', 'label']]\n", "#merged_pt_test_df = merged_pt_test_df[['audio_filepath', 'duration', 'label']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
audio_filepathdurationlabel
0/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...6.504c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
1/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.656c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
2/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...3.504c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
3/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...3.456c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
4/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.224c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...
............
21052/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...4.860d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21053/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...2.196d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21054/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...2.124d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21055/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...1.908d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
21056/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...5.436d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...
\n", "

21057 rows × 3 columns

\n", "
" ], "text/plain": [ " audio_filepath duration \\\n", "0 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 6.504 \n", "1 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.656 \n", "2 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.504 \n", "3 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.456 \n", "4 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.224 \n", "... ... ... \n", "21052 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.860 \n", "21053 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.196 \n", "21054 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.124 \n", "21055 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 1.908 \n", "21056 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 5.436 \n", "\n", " label \n", "0 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "1 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "2 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "3 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "4 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n", "... ... \n", "21052 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21053 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21054 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21055 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "21056 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n", "\n", "[21057 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#merged_pt_train_df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": {}, "colab_type": "code", "id": "vnrUh3vuDSRN" }, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "# prepare the train, dev, test dataset for Turkish language\n", "tr_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/clip_durations.tsv', sep='\\t')\n", "tr_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/train.tsv', sep='\\t')\n", "tr_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/dev.tsv', sep='\\t')\n", "tr_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/test.tsv', sep='\\t')\n", "\n", "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n", "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":5: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":6: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", ":7: FutureWarning: The default value of regex will change from True to False in a future version.\n", " merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", 
\".wav\")\n" ] } ], "source": [ "\n", "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join('/Users/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join('/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join('/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips', x))\n", "\n", "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n", "\n", "merged_tr_train_df['duration'] = merged_tr_train_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'].apply(lambda x: x / 1000)\n", "merged_tr_test_df['duration'] = merged_tr_test_df['duration'].apply(lambda x: x / 1000)\n", "\n", "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n", "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "merged_tr_train_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/train.json', orient='records', lines=True)\n", "merged_tr_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/dev.json', orient='records', lines=True)\n", "merged_tr_test_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/test.json', orient='records', lines=True)\n", "\n", "#merged_pt_train_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/train.json', orient='records', lines=True)\n", "#merged_pt_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/dev.json', orient='records', lines=True)\n", "#merged_pt_test_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/test.json', orient='records', lines=True)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: TitaNet-Finetune\n", "sample_rate: 16000\n", "init_from_pretrained_model:\n", " speaker_tasks:\n", " name: titanet_large\n", " include:\n", " - preprocessor\n", " - encoder\n", " exclude:\n", " - decoder.final\n", "model:\n", " train_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 64\n", " shuffle: true\n", " is_tarred: false\n", " tarred_audio_filepaths: null\n", " tarred_shard_strategy: scatter\n", " augmentor:\n", " speed:\n", " prob: 0.3\n", " sr: 16000\n", " resample_type: kaiser_fast\n", " min_speed_rate: 0.95\n", " max_speed_rate: 1.05\n", " validation_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 128\n", " shuffle: false\n", " test_ds:\n", " manifest_filepath: ???\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 1\n", " shuffle: false\n", " embedding_dir: ./embeddings\n", " model_defaults:\n", " filters: 1024\n", " repeat: 3\n", " dropout: 0.1\n", " separable: true\n", " se: true\n", " se_context_size: -1\n", " kernel_size_factor: 1.0\n", " preprocessor:\n", " _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor\n", " normalize: per_feature\n", " window_size: 
0.025\n", " sample_rate: 16000\n", " window_stride: 0.01\n", " window: hann\n", " features: 80\n", " n_fft: 512\n", " frame_splicing: 1\n", " dither: 1.0e-05\n", " encoder:\n", " _target_: nemo.collections.asr.modules.ConvASREncoder\n", " feat_in: 80\n", " activation: relu\n", " conv_mask: true\n", " jasper:\n", " - filters: ${model.model_defaults.filters}\n", " repeat: 1\n", " kernel:\n", " - 3\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: 0.0\n", " residual: false\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 7\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 11\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: ${model.model_defaults.filters}\n", " repeat: ${model.model_defaults.repeat}\n", " kernel:\n", " - 15\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: ${model.model_defaults.dropout}\n", " residual: true\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " - filters: 3072\n", " repeat: 1\n", " kernel:\n", " - 1\n", " stride:\n", " - 1\n", " dilation:\n", " - 1\n", " dropout: 0.0\n", " residual: false\n", " separable: ${model.model_defaults.separable}\n", " se: ${model.model_defaults.se}\n", " se_context_size: ${model.model_defaults.se_context_size}\n", " decoder:\n", " _target_: nemo.collections.asr.modules.SpeakerDecoder\n", " feat_in: 3072\n", " num_classes: ???\n", " pool_mode: attention\n", " emb_sizes: 192\n", " loss:\n", " _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss\n", " scale: 30\n", " margin: 0.2\n", " optim_param_groups:\n", " encoder:\n", " lr: 0.001\n", " optim:\n", " name: adamw\n", " lr: 0.0001\n", " weight_decay: 0.0002\n", " sched:\n", " name: CosineAnnealing\n", " warmup_ratio: 0.1\n", " min_lr: 0.0\n", "trainer:\n", " devices: 1\n", " max_epochs: 10\n", " max_steps: -1\n", " num_nodes: 1\n", " accelerator: gpu\n", " strategy: ddp\n", " deterministic: true\n", " enable_checkpointing: false\n", " logger: false\n", " log_every_n_steps: 1\n", " val_check_interval: 1.0\n", " gradient_clip_val: 1.0\n", "exp_manager:\n", " exp_dir: null\n", " name: TitaNet-Finetune\n", " create_tensorboard_logger: true\n", " create_checkpoint_callback: true\n", "\n" ] } ], "source": [ "# Set up the config for fine-tuning\n", "from omegaconf import OmegaConf\n", "finetune_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "print(OmegaConf.to_yaml(finetune_config))\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Fine-tune the model with Portuguese language\n", "\n", "import torch\n", "import pytorch_lightning as pl\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from 
omegaconf import OmegaConf\n", "from nemo.utils.exp_manager import exp_manager\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pt_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "## set up the trainer\n", "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "\n", "pt_trainer_config = OmegaConf.create(dict(\n", " devices=4,\n", " accelerator=accelerator,\n", " max_epochs=5,\n", " max_steps=-1, # computed at runtime if not set\n", " num_nodes=1,\n", " accumulate_grad_batches=1,\n", " enable_checkpointing=False, # Provided by exp_manager\n", " logger=False, # Provided by exp_manager\n", " log_every_n_steps=1, # Interval of logging.\n", " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", "))\n", "print(OmegaConf.to_yaml(pt_trainer_config))\n", "pt_trainer_finetune = pl.Trainer(**pt_trainer_config)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#set up the nemo experiment for logging and monitoring purpose\n", "log_dir_finetune = exp_manager(trainer=pt_trainer_finetune, config=pt_config, name='titanet_finetune_pt').get_save_dir()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# set up the manifest file for Portuguese language\n", "pt_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/train.json'\n", "pt_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/dev.json'\n", "pt_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/test.json'\n", "pt_config.model.decoder.num_classes = merged_pt_train_df['label'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up the model for Portuguese language and train the model\n", "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=pt_config.model, trainer=trainer_finetune)\n", "speaker_model.maybe_init_from_pretrained_checkpoint(pt_config)\n", "\n", "pt_trainer_finetune.fit(speaker_model)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the model after fine-tuning with Portuguese language\n", "speaker_model.save_to('titanet_finetune_pt.nemo')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "devices: 1\n", "accelerator: cpu\n", "max_epochs: 5\n", "max_steps: -1\n", "num_nodes: 1\n", "accumulate_grad_batches: 1\n", "enable_checkpointing: false\n", "logger: false\n", "log_every_n_steps: 1\n", "val_check_interval: 1.0\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "GPU available: False, used: False\n", "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", "HPU available: False, using: 0 HPUs\n", "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:15:08 exp_manager:381] Experiments will be logged at /User/en_tr_pt_titanet_large/nemo_experiments/TitaNet-Finetune/2023-09-25_04-36-46\n", "[NeMo I 2023-09-25 05:15:08 exp_manager:815] TensorboardLogger has been set up\n", "[NeMo I 2023-09-25 05:15:08 exp_manager:930] Preemption is supported only on GPUs, disabling preemption\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading 
collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:31 label_models:187] Total number of 24 found in all the manifest files.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10502 items, total duration of 10.23 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 10502 files loaded accounting to # 128 labels\n", "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10880 items, total duration of 12.25 hours.\n", "[NeMo I 2023-09-25 05:31:31 collections:304] # 10880 files loaded accounting to # 1244 labels\n", "[NeMo I 2023-09-25 05:31:31 features:289] PADDING: 16\n", "[NeMo I 2023-09-25 05:31:32 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n", "[NeMo I 2023-09-25 05:31:38 common:913] Instantiating model from pre-trained checkpoint\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", " Train config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 64\n", " shuffle: true\n", " is_tarred: false\n", " tarred_audio_filepaths: null\n", " tarred_shard_strategy: scatter\n", " augmentor:\n", " noise:\n", " manifest_path: /manifests/noise/rir_noise_manifest.json\n", " prob: 0.5\n", " min_snr_db: 0\n", " max_snr_db: 15\n", " speed:\n", " prob: 0.5\n", " sr: 16000\n", " resample_type: kaiser_fast\n", " min_speed_rate: 0.95\n", " max_speed_rate: 1.05\n", " num_workers: 15\n", " pin_memory: true\n", " \n", "[NeMo W 2023-09-25 05:31:38 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", " Validation config : \n", " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n", " sample_rate: 16000\n", " labels: null\n", " batch_size: 128\n", " shuffle: false\n", " num_workers: 15\n", " pin_memory: true\n", " \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[NeMo I 2023-09-25 05:31:38 features:289] PADDING: 16\n", "[NeMo I 2023-09-25 05:31:39 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n", "[NeMo I 2023-09-25 05:31:39 modelPT:1156] Make sure that this is what you wanted!\n", "[NeMo I 2023-09-25 05:31:39 modelPT:735] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.0001\n", " maximize: False\n", " weight_decay: 0.0002\n", " \n", " Parameter Group 1\n", " amsgrad: False\n", " betas: (0.9, 0.999)\n", " capturable: False\n", " eps: 1e-08\n", " foreach: None\n", " lr: 0.001\n", " maximize: False\n", " weight_decay: 0.0002\n", " )\n", "[NeMo I 2023-09-25 05:31:39 lr_scheduler:910] Scheduler \"\" \n", " will be used during training (effective maximum steps = 2430) - \n", " Parameters : \n", " (warmup_ratio: 0.1\n", " min_lr: 0.0\n", " max_steps: 2430\n", " )\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " | Name | Type | Params\n", "----------------------------------------------------------------------\n", "0 | loss | AngularSoftmaxLoss | 0 \n", "1 | eval_loss | AngularSoftmaxLoss | 0 \n", "2 | _accuracy | TopKClassificationAccuracy | 0 \n", "3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n", "4 | encoder | ConvASREncoder | 19.4 M\n", "5 | decoder | SpeakerDecoder | 2.8 M \n", "6 | _macro_accuracy | MulticlassAccuracy | 0 \n", "----------------------------------------------------------------------\n", "22.1 M Trainable params\n", "0 Non-trainable params\n", "22.1 M Total params\n", "88.508 Total estimated model params size (MB)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8a6fa6c7b4214098b48c00a8562b8051", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Sanity Checking: 0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[NeMo W 2023-09-25 05:31:39 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n", " rank_zero_warn(\n", " \n", "[NeMo E 2023-09-25 05:31:39 segment:249] Loading /User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav via SoundFile raised RuntimeError: `Error opening '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav': System error.`. 
NeMo will fallback to loading via pydub.\n" ] }, { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[16], line 45\u001b[0m\n\u001b[1;32m 43\u001b[0m speaker_model \u001b[38;5;241m=\u001b[39m nemo_asr\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mEncDecSpeakerLabelModel(cfg\u001b[38;5;241m=\u001b[39mtr_config\u001b[38;5;241m.\u001b[39mmodel, trainer\u001b[38;5;241m=\u001b[39mtr_trainer_finetune)\n\u001b[1;32m 44\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39mmaybe_init_from_pretrained_checkpoint(tr_config)\n\u001b[0;32m---> 45\u001b[0m \u001b[43mtr_trainer_finetune\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mspeaker_model\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Save the model after fine-tuning with Turkish language\u001b[39;00m\n\u001b[1;32m 49\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39msave_to(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitanet_finetune_tr.nemo\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:532\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39m_lightning_module \u001b[38;5;241m=\u001b[39m model\n\u001b[1;32m 531\u001b[0m _verify_strategy_supports_compile(model, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy)\n\u001b[0;32m--> 532\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:43\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 46\u001b[0m _call_teardown_hook(trainer)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:571\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_connector\u001b[38;5;241m.\u001b[39mattach_data(\n\u001b[1;32m 562\u001b[0m model, train_dataloaders\u001b[38;5;241m=\u001b[39mtrain_dataloaders, val_dataloaders\u001b[38;5;241m=\u001b[39mval_dataloaders, datamodule\u001b[38;5;241m=\u001b[39mdatamodule\n\u001b[1;32m 563\u001b[0m )\n\u001b[1;32m 565\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 567\u001b[0m ckpt_path,\n\u001b[1;32m 568\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 569\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 570\u001b[0m )\n\u001b[0;32m--> 571\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:980\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_signal_connector\u001b[38;5;241m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 977\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 978\u001b[0m \u001b[38;5;66;03m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 980\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_stage\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 982\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 983\u001b[0m \u001b[38;5;66;03m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 984\u001b[0m 
\u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 985\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: trainer tearing down\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1021\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining:\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m isolate_rng():\n\u001b[0;32m-> 1021\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sanity_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1022\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mautograd\u001b[38;5;241m.\u001b[39mset_detect_anomaly(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_detect_anomaly):\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit_loop\u001b[38;5;241m.\u001b[39mrun()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1050\u001b[0m, in \u001b[0;36mTrainer._run_sanity_check\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1047\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_start\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1049\u001b[0m \u001b[38;5;66;03m# run eval step\u001b[39;00m\n\u001b[0;32m-> 1050\u001b[0m \u001b[43mval_loop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1052\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_end\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:181\u001b[0m, in \u001b[0;36m_no_grad_context.._decorator\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 179\u001b[0m context_manager \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mno_grad\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context_manager():\n\u001b[0;32m--> 181\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloop_run\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/evaluation_loop.py:108\u001b[0m, in \u001b[0;36m_EvaluationLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 108\u001b[0m batch, 
batch_idx, dataloader_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata_fetcher\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_progress\u001b[38;5;241m.\u001b[39mis_last_batch \u001b[38;5;241m=\u001b[39m data_fetcher\u001b[38;5;241m.\u001b[39mdone\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m previous_dataloader_idx \u001b[38;5;241m!=\u001b[39m dataloader_idx:\n\u001b[1;32m 111\u001b[0m \u001b[38;5;66;03m# the dataloader has changed, notify the logger connector\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:137\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdone:\n\u001b[1;32m 135\u001b[0m \u001b[38;5;66;03m# this will run only when no pre-fetching was done.\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fetch_next_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataloader_iter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# consume the batch we just fetched\u001b[39;00m\n\u001b[1;32m 139\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatches\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;241m0\u001b[39m)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:151\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher._fetch_next_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_start_profiler()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop_profiler()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:285\u001b[0m, in \u001b[0;36mCombinedLoader.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator, _Sequential):\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n", "File 
\u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:123\u001b[0m, in \u001b[0;36m_Sequential.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 123\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterators\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:628\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:671\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 670\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# 
may raise StopIteration\u001b[39;00m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 673\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/data/audio_to_label.py:327\u001b[0m, in \u001b[0;36m_AudioLabelDataset.__getitem__\u001b[0;34m(self, index)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m offset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 325\u001b[0m offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 327\u001b[0m features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeaturizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maudio_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 328\u001b[0m f, fl \u001b[38;5;241m=\u001b[39m features, torch\u001b[38;5;241m.\u001b[39mtensor(features\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mlong()\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_regression_task:\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/features.py:186\u001b[0m, in \u001b[0;36mWaveformFeaturizer.process\u001b[0;34m(self, file_path, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess\u001b[39m(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 174\u001b[0m file_path,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 184\u001b[0m normalize_db\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 185\u001b[0m ):\n\u001b[0;32m--> 186\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[43mAudioSegment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample_rate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mint_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_ref\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_ref\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_top_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_top_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_frame_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_frame_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_hop_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_hop_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43morig_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morig_sr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43mchannel_selector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchannel_selector\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mnormalize_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnormalize_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocess_segment(audio)\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/segment.py:259\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, audio_file, target_sr, int_values, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m HAVE_PYDUB \u001b[38;5;129;01mand\u001b[39;00m samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 259\u001b[0m samples \u001b[38;5;241m=\u001b[39m \u001b[43mAudio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m sample_rate \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mframe_rate\n\u001b[1;32m 261\u001b[0m num_channels \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mchannels\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/audio_segment.py:651\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, file, format, codec, parameters, start_second, duration, **kwargs)\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 650\u001b[0m filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 651\u001b[0m file, close_file \u001b[38;5;241m=\u001b[39m \u001b[43m_fd_or_path_or_tempfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtempfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mformat\u001b[39m:\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28mformat\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mlower()\n", "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/utils.py:60\u001b[0m, in \u001b[0;36m_fd_or_path_or_tempfile\u001b[0;34m(fd, mode, tempfile)\u001b[0m\n\u001b[1;32m 57\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(fd, basestring):\n\u001b[0;32m---> 60\u001b[0m fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 
'/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'" ] } ], "source": [ "# Imports for fine-tuning (repeated so this cell can run standalone)\n", "\n", "import torch\n", "import pytorch_lightning as pl\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from omegaconf import OmegaConf\n", "from nemo.utils.exp_manager import exp_manager\n", "\n", "# Fine-tune the model with Turkish language\n", "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n", "## Set up the trainer (fall back to CPU when no GPU is available)\n", "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "\n", "tr_trainer_config = OmegaConf.create(dict(\n", " devices=4,\n", " accelerator=accelerator,\n", " max_epochs=5,\n", " max_steps=-1, # computed at runtime if not set\n", " num_nodes=1,\n", " accumulate_grad_batches=1,\n", " enable_checkpointing=False, # Provided by exp_manager\n", " logger=False, # Provided by exp_manager\n", " log_every_n_steps=1, # Interval of logging.\n", " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n", "))\n", "print(OmegaConf.to_yaml(tr_trainer_config))\n", "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n", "\n", "\n", "# Set up the NeMo experiment manager for logging and monitoring\n", "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n", "\n", "\n", "# Point the config at the Turkish manifests\n", "tr_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/train.json'\n", "tr_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/dev.json'\n", "tr_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/test.json'\n", "# The decoder output size must match the number of speakers in the training split\n", "tr_config.model.decoder.num_classes = merged_tr_train_df['label'].nunique()\n", "\n", "\n", "# Set up the model for Turkish and train it\n", "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n", "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n", "tr_trainer_finetune.fit(speaker_model)\n", "\n", "# Save the model after fine-tuning with Turkish\n", "\n", "speaker_model.save_to('titanet_finetune_tr.nemo')" ] },
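{ "cell_type": "markdown", "metadata": {}, "source": [ "The `fit` call above stops with a `FileNotFoundError`: the Turkish manifests point at `.wav` files, but the sox conversion cell near the top of the notebook is commented out, so those files were never created. One way to guard against this, sketched under the assumption that the manifests above have already been written, is to drop entries whose audio file is missing before handing the manifests to the trainer. The `filter_missing_audio` helper and the `*_filtered.json` file names are hypothetical; the filtered paths would then be assigned to `tr_config.model.*_ds.manifest_filepath`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "def filter_missing_audio(manifest_path, filtered_path):\n", "    \"\"\"Copy a NeMo manifest, keeping only entries whose wav file exists on disk.\"\"\"\n", "    kept, dropped = 0, 0\n", "    with open(manifest_path) as src, open(filtered_path, 'w') as dst:\n", "        for line in src:\n", "            entry = json.loads(line)\n", "            if os.path.isfile(entry['audio_filepath']):\n", "                dst.write(line)\n", "                kept += 1\n", "            else:\n", "                dropped += 1\n", "    print(f'{manifest_path}: kept {kept} entries, dropped {dropped} with missing audio')\n", "\n", "for split in ('train', 'dev', 'test'):\n", "    filter_missing_audio(f'data/cv-corpus-15.0-2023-09-08/tr/{split}.json',\n", "                         f'data/cv-corpus-15.0-2023-09-08/tr/{split}_filtered.json')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "Speaker_Recogniton_Verification.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "transcribe", "language": "python", "name": "conda-env-.conda-transcribe-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 4 }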