pgwi committed on
Commit
a121bc0
·
1 Parent(s): 8de579a

Delete fine_tune_tianet_tr_pt.ipynb

Browse files
Files changed (1) hide show
  1. fine_tune_tianet_tr_pt.ipynb +0 -923
fine_tune_tianet_tr_pt.ipynb DELETED
@@ -1,923 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {
7
- "colab": {},
8
- "colab_type": "code",
9
- "id": "iyLoWDsb9rEs"
10
- },
11
- "outputs": [],
12
- "source": [
13
- "# Unzip the audio archives of the Common Voice dataset for the Turkish and Portuguese languages\n",
14
- "#! tar -xf data/cv-corpus-15.0-2023-09-08-pt.tar.gz\n",
15
- "#! tar -xf data/cv-corpus-15.0-2023-09-08-tr.tar.gz"
16
- ]
17
- },
18
- {
19
- "cell_type": "code",
20
- "execution_count": 2,
21
- "metadata": {},
22
- "outputs": [
23
- {
24
- "name": "stdout",
25
- "output_type": "stream",
26
- "text": [
27
- "/User/en_tr_pt_titanet_large\n"
28
- ]
29
- }
30
- ],
31
- "source": [
32
- "# Convert the mp3 files to wav files with 16kHz sampling rate and 16 bits, 1 channel\n",
33
- "import os\n",
34
- "NEMO_ROOT = os.getcwd()\n",
35
- "print(NEMO_ROOT)\n",
36
- "import glob\n",
37
- "import subprocess\n",
38
- "\n",
39
- "data_dir = os.path.join(NEMO_ROOT,'data')\n",
40
- "#os.makedirs(data_dir, exist_ok=True)\n",
41
- "\n",
42
- "#print(\"Converting .mp3 to .wav...\")\n",
43
- "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/pt/clips/*.mp3', recursive=True)\n",
44
- "#for mp3_path in mp3_list:\n",
45
- "# wav_path = mp3_path[:-4] + '.wav'\n",
46
- "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n",
47
- "# subprocess.run(cmd)\n",
48
- "#print(\"Finished conversion.\\n******\")"
49
- ]
50
- },
51
- {
52
- "cell_type": "code",
53
- "execution_count": 3,
54
- "metadata": {},
55
- "outputs": [],
56
- "source": [
57
- "#print(\"Converting .mp3 to .wav...\")\n",
58
- "#mp3_list = glob.glob(data_dir + '/cv-corpus-15.0-2023-09-08/tr/clips/*.mp3', recursive=True)\n",
59
- "#for mp3_path in mp3_list:\n",
60
- "# wav_path = mp3_path[:-4] + '.wav'\n",
61
- "# cmd = [\"sox\", mp3_path, \"--rate\", \"16k\", \"--bits\", \"16\", \"--channels\", \"1\", wav_path]\n",
62
- "# subprocess.run(cmd)\n",
63
- "#print(\"Finished conversion.\\n******\")"
64
- ]
65
- },
66
- {
67
- "cell_type": "code",
68
- "execution_count": 1,
69
- "metadata": {
70
- "colab": {},
71
- "colab_type": "code",
72
- "id": "vqUBayc_Ctcr"
73
- },
74
- "outputs": [],
75
- "source": [
76
- "# prepare the train, dev, test dataset for Portuguese language\n",
77
- "import pandas as pd\n",
78
- "import os\n",
79
- "\n",
80
- "#pt_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/clip_durations.tsv', sep='\\t')\n",
81
- "#pt_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/train.tsv', sep='\\t')\n",
82
- "#pt_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/dev.tsv', sep='\\t')\n",
83
- "#pt_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/pt/test.tsv', sep='\\t')\n",
84
- "\n",
85
- "#merged_pt_train_df = pd.merge(pt_train_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
86
- "#merged_pt_dev_df = pd.merge(pt_dev_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
87
- "#merged_pt_test_df = pd.merge(pt_test_df, pt_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})"
88
- ]
89
- },
90
- {
91
- "cell_type": "code",
92
- "execution_count": 6,
93
- "metadata": {},
94
- "outputs": [],
95
- "source": [
96
- "#merged_pt_train_df['audio_filepath'] = merged_pt_train_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n",
97
- "#merged_pt_dev_df['audio_filepath'] = merged_pt_dev_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n",
98
- "#merged_pt_test_df['audio_filepath'] = merged_pt_test_df['path'].apply(lambda x: os.path.join('/Users/Peng_Wei/work/mlrun_related/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/pt/clips', x))\n",
99
- "\n",
100
- "#merged_pt_train_df[\"audio_filepath\"] = merged_pt_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
101
- "#merged_pt_dev_df[\"audio_filepath\"] = merged_pt_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
102
- "#merged_pt_test_df[\"audio_filepath\"] = merged_pt_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
103
- "\n",
104
- "#merged_pt_train_df['duration'] = merged_pt_train_df['duration'].apply(lambda x: x / 1000)\n",
105
- "#merged_pt_dev_df['duration'] = merged_pt_dev_df['duration'].apply(lambda x: x / 1000)\n",
106
- "#merged_pt_test_df['duration'] = merged_pt_test_df['duration'].apply(lambda x: x / 1000)\n",
107
- "\n",
108
- "#merged_pt_train_df = merged_pt_train_df[['audio_filepath', 'duration', 'label']]\n",
109
- "#merged_pt_dev_df = merged_pt_dev_df[['audio_filepath', 'duration', 'label']]\n",
110
- "#merged_pt_test_df = merged_pt_test_df[['audio_filepath', 'duration', 'label']]"
111
- ]
112
- },
113
- {
114
- "cell_type": "code",
115
- "execution_count": 7,
116
- "metadata": {},
117
- "outputs": [
118
- {
119
- "data": {
120
- "text/html": [
121
- "<div>\n",
122
- "<style scoped>\n",
123
- " .dataframe tbody tr th:only-of-type {\n",
124
- " vertical-align: middle;\n",
125
- " }\n",
126
- "\n",
127
- " .dataframe tbody tr th {\n",
128
- " vertical-align: top;\n",
129
- " }\n",
130
- "\n",
131
- " .dataframe thead th {\n",
132
- " text-align: right;\n",
133
- " }\n",
134
- "</style>\n",
135
- "<table border=\"1\" class=\"dataframe\">\n",
136
- " <thead>\n",
137
- " <tr style=\"text-align: right;\">\n",
138
- " <th></th>\n",
139
- " <th>audio_filepath</th>\n",
140
- " <th>duration</th>\n",
141
- " <th>label</th>\n",
142
- " </tr>\n",
143
- " </thead>\n",
144
- " <tbody>\n",
145
- " <tr>\n",
146
- " <th>0</th>\n",
147
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
148
- " <td>6.504</td>\n",
149
- " <td>c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...</td>\n",
150
- " </tr>\n",
151
- " <tr>\n",
152
- " <th>1</th>\n",
153
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
154
- " <td>4.656</td>\n",
155
- " <td>c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...</td>\n",
156
- " </tr>\n",
157
- " <tr>\n",
158
- " <th>2</th>\n",
159
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
160
- " <td>3.504</td>\n",
161
- " <td>c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...</td>\n",
162
- " </tr>\n",
163
- " <tr>\n",
164
- " <th>3</th>\n",
165
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
166
- " <td>3.456</td>\n",
167
- " <td>c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...</td>\n",
168
- " </tr>\n",
169
- " <tr>\n",
170
- " <th>4</th>\n",
171
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
172
- " <td>4.224</td>\n",
173
- " <td>c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8...</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <th>...</th>\n",
177
- " <td>...</td>\n",
178
- " <td>...</td>\n",
179
- " <td>...</td>\n",
180
- " </tr>\n",
181
- " <tr>\n",
182
- " <th>21052</th>\n",
183
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
184
- " <td>4.860</td>\n",
185
- " <td>d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...</td>\n",
186
- " </tr>\n",
187
- " <tr>\n",
188
- " <th>21053</th>\n",
189
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
190
- " <td>2.196</td>\n",
191
- " <td>d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...</td>\n",
192
- " </tr>\n",
193
- " <tr>\n",
194
- " <th>21054</th>\n",
195
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
196
- " <td>2.124</td>\n",
197
- " <td>d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...</td>\n",
198
- " </tr>\n",
199
- " <tr>\n",
200
- " <th>21055</th>\n",
201
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
202
- " <td>1.908</td>\n",
203
- " <td>d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...</td>\n",
204
- " </tr>\n",
205
- " <tr>\n",
206
- " <th>21056</th>\n",
207
- " <td>/Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti...</td>\n",
208
- " <td>5.436</td>\n",
209
- " <td>d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392...</td>\n",
210
- " </tr>\n",
211
- " </tbody>\n",
212
- "</table>\n",
213
- "<p>21057 rows × 3 columns</p>\n",
214
- "</div>"
215
- ],
216
- "text/plain": [
217
- " audio_filepath duration \\\n",
218
- "0 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 6.504 \n",
219
- "1 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.656 \n",
220
- "2 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.504 \n",
221
- "3 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 3.456 \n",
222
- "4 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.224 \n",
223
- "... ... ... \n",
224
- "21052 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 4.860 \n",
225
- "21053 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.196 \n",
226
- "21054 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 2.124 \n",
227
- "21055 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 1.908 \n",
228
- "21056 /Users/Peng_Wei/work/mlrun_related/en_tr_pt_ti... 5.436 \n",
229
- "\n",
230
- " label \n",
231
- "0 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n",
232
- "1 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n",
233
- "2 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n",
234
- "3 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n",
235
- "4 c1b7c535717cd09b0e3e9de74b0382d810b266e47091d8... \n",
236
- "... ... \n",
237
- "21052 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n",
238
- "21053 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n",
239
- "21054 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n",
240
- "21055 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n",
241
- "21056 d8288aee86a2a6a3ab6f3e8d4028ef097b51698b2f7392... \n",
242
- "\n",
243
- "[21057 rows x 3 columns]"
244
- ]
245
- },
246
- "execution_count": 7,
247
- "metadata": {},
248
- "output_type": "execute_result"
249
- }
250
- ],
251
- "source": [
252
- "#merged_pt_train_df"
253
- ]
254
- },
255
- {
256
- "cell_type": "code",
257
- "execution_count": 4,
258
- "metadata": {
259
- "colab": {},
260
- "colab_type": "code",
261
- "id": "vnrUh3vuDSRN"
262
- },
263
- "outputs": [],
264
- "source": [
265
- "import pandas as pd\n",
266
- "import os\n",
267
- "# prepare the train, dev, test dataset for Turkish language\n",
268
- "tr_duration_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/clip_durations.tsv', sep='\\t')\n",
269
- "tr_train_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/train.tsv', sep='\\t')\n",
270
- "tr_dev_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/dev.tsv', sep='\\t')\n",
271
- "tr_test_df = pd.read_csv('data/cv-corpus-15.0-2023-09-08/tr/test.tsv', sep='\\t')\n",
272
- "\n",
273
- "merged_tr_train_df = pd.merge(tr_train_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
274
- "merged_tr_dev_df = pd.merge(tr_dev_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})\n",
275
- "merged_tr_test_df = pd.merge(tr_test_df, tr_duration_df, left_on='path', right_on='clip', how='left')[['path', 'duration[ms]', 'client_id']].rename(columns={'duration[ms]': 'duration', 'client_id': 'label'})"
276
- ]
277
- },
278
- {
279
- "cell_type": "code",
280
- "execution_count": 5,
281
- "metadata": {},
282
- "outputs": [
283
- {
284
- "name": "stderr",
285
- "output_type": "stream",
286
- "text": [
287
- "<ipython-input-5-81ac8797cb7a>:5: FutureWarning: The default value of regex will change from True to False in a future version.\n",
288
- " merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
289
- "<ipython-input-5-81ac8797cb7a>:6: FutureWarning: The default value of regex will change from True to False in a future version.\n",
290
- " merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n",
291
- "<ipython-input-5-81ac8797cb7a>:7: FutureWarning: The default value of regex will change from True to False in a future version.\n",
292
- " merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\")\n"
293
- ]
294
- }
295
- ],
296
- "source": [
297
- "tr_clips_dir = os.path.abspath('data/cv-corpus-15.0-2023-09-08/tr/clips')  # one clips root for all three splits, resolved against the current working directory\n",
298
- "merged_tr_train_df['audio_filepath'] = merged_tr_train_df['path'].apply(lambda x: os.path.join(tr_clips_dir, x))\n",
299
- "merged_tr_dev_df['audio_filepath'] = merged_tr_dev_df['path'].apply(lambda x: os.path.join(tr_clips_dir, x))\n",
300
- "merged_tr_test_df['audio_filepath'] = merged_tr_test_df['path'].apply(lambda x: os.path.join(tr_clips_dir, x))\n",
301
- "# reference the .wav files instead of the original .mp3 clips; regex=False makes '.mp3' a literal match, not a pattern\n",
302
- "merged_tr_train_df[\"audio_filepath\"] = merged_tr_train_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\", regex=False)\n",
303
- "merged_tr_dev_df[\"audio_filepath\"] = merged_tr_dev_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\", regex=False)\n",
304
- "merged_tr_test_df[\"audio_filepath\"] = merged_tr_test_df[\"audio_filepath\"].str.replace(\".mp3\", \".wav\", regex=False)\n",
305
- "# 'duration' was renamed from the 'duration[ms]' column; convert milliseconds to seconds\n",
306
- "merged_tr_train_df['duration'] = merged_tr_train_df['duration'] / 1000\n",
307
- "merged_tr_dev_df['duration'] = merged_tr_dev_df['duration'] / 1000\n",
308
- "merged_tr_test_df['duration'] = merged_tr_test_df['duration'] / 1000\n",
309
- "# keep only the columns that are written to the manifest files\n",
310
- "merged_tr_train_df = merged_tr_train_df[['audio_filepath', 'duration', 'label']]\n",
311
- "merged_tr_dev_df = merged_tr_dev_df[['audio_filepath', 'duration', 'label']]\n",
312
- "merged_tr_test_df = merged_tr_test_df[['audio_filepath', 'duration', 'label']]"
313
- ]
314
- },
315
- {
316
- "cell_type": "code",
317
- "execution_count": 7,
318
- "metadata": {},
319
- "outputs": [],
320
- "source": [
321
- "merged_tr_train_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/train.json', orient='records', lines=True)\n",
322
- "merged_tr_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/dev.json', orient='records', lines=True)\n",
323
- "merged_tr_test_df.to_json('data/cv-corpus-15.0-2023-09-08/tr/test.json', orient='records', lines=True)\n",
324
- "\n",
325
- "#merged_pt_train_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/train.json', orient='records', lines=True)\n",
326
- "#merged_pt_dev_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/dev.json', orient='records', lines=True)\n",
327
- "#merged_pt_test_df.to_json('data/cv-corpus-15.0-2023-09-08/pt/test.json', orient='records', lines=True)\n"
328
- ]
329
- },
330
- {
331
- "cell_type": "code",
332
- "execution_count": 8,
333
- "metadata": {},
334
- "outputs": [
335
- {
336
- "name": "stdout",
337
- "output_type": "stream",
338
- "text": [
339
- "name: TitaNet-Finetune\n",
340
- "sample_rate: 16000\n",
341
- "init_from_pretrained_model:\n",
342
- " speaker_tasks:\n",
343
- " name: titanet_large\n",
344
- " include:\n",
345
- " - preprocessor\n",
346
- " - encoder\n",
347
- " exclude:\n",
348
- " - decoder.final\n",
349
- "model:\n",
350
- " train_ds:\n",
351
- " manifest_filepath: ???\n",
352
- " sample_rate: 16000\n",
353
- " labels: null\n",
354
- " batch_size: 64\n",
355
- " shuffle: true\n",
356
- " is_tarred: false\n",
357
- " tarred_audio_filepaths: null\n",
358
- " tarred_shard_strategy: scatter\n",
359
- " augmentor:\n",
360
- " speed:\n",
361
- " prob: 0.3\n",
362
- " sr: 16000\n",
363
- " resample_type: kaiser_fast\n",
364
- " min_speed_rate: 0.95\n",
365
- " max_speed_rate: 1.05\n",
366
- " validation_ds:\n",
367
- " manifest_filepath: ???\n",
368
- " sample_rate: 16000\n",
369
- " labels: null\n",
370
- " batch_size: 128\n",
371
- " shuffle: false\n",
372
- " test_ds:\n",
373
- " manifest_filepath: ???\n",
374
- " sample_rate: 16000\n",
375
- " labels: null\n",
376
- " batch_size: 1\n",
377
- " shuffle: false\n",
378
- " embedding_dir: ./embeddings\n",
379
- " model_defaults:\n",
380
- " filters: 1024\n",
381
- " repeat: 3\n",
382
- " dropout: 0.1\n",
383
- " separable: true\n",
384
- " se: true\n",
385
- " se_context_size: -1\n",
386
- " kernel_size_factor: 1.0\n",
387
- " preprocessor:\n",
388
- " _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor\n",
389
- " normalize: per_feature\n",
390
- " window_size: 0.025\n",
391
- " sample_rate: 16000\n",
392
- " window_stride: 0.01\n",
393
- " window: hann\n",
394
- " features: 80\n",
395
- " n_fft: 512\n",
396
- " frame_splicing: 1\n",
397
- " dither: 1.0e-05\n",
398
- " encoder:\n",
399
- " _target_: nemo.collections.asr.modules.ConvASREncoder\n",
400
- " feat_in: 80\n",
401
- " activation: relu\n",
402
- " conv_mask: true\n",
403
- " jasper:\n",
404
- " - filters: ${model.model_defaults.filters}\n",
405
- " repeat: 1\n",
406
- " kernel:\n",
407
- " - 3\n",
408
- " stride:\n",
409
- " - 1\n",
410
- " dilation:\n",
411
- " - 1\n",
412
- " dropout: 0.0\n",
413
- " residual: false\n",
414
- " separable: ${model.model_defaults.separable}\n",
415
- " se: ${model.model_defaults.se}\n",
416
- " se_context_size: ${model.model_defaults.se_context_size}\n",
417
- " - filters: ${model.model_defaults.filters}\n",
418
- " repeat: ${model.model_defaults.repeat}\n",
419
- " kernel:\n",
420
- " - 7\n",
421
- " stride:\n",
422
- " - 1\n",
423
- " dilation:\n",
424
- " - 1\n",
425
- " dropout: ${model.model_defaults.dropout}\n",
426
- " residual: true\n",
427
- " separable: ${model.model_defaults.separable}\n",
428
- " se: ${model.model_defaults.se}\n",
429
- " se_context_size: ${model.model_defaults.se_context_size}\n",
430
- " - filters: ${model.model_defaults.filters}\n",
431
- " repeat: ${model.model_defaults.repeat}\n",
432
- " kernel:\n",
433
- " - 11\n",
434
- " stride:\n",
435
- " - 1\n",
436
- " dilation:\n",
437
- " - 1\n",
438
- " dropout: ${model.model_defaults.dropout}\n",
439
- " residual: true\n",
440
- " separable: ${model.model_defaults.separable}\n",
441
- " se: ${model.model_defaults.se}\n",
442
- " se_context_size: ${model.model_defaults.se_context_size}\n",
443
- " - filters: ${model.model_defaults.filters}\n",
444
- " repeat: ${model.model_defaults.repeat}\n",
445
- " kernel:\n",
446
- " - 15\n",
447
- " stride:\n",
448
- " - 1\n",
449
- " dilation:\n",
450
- " - 1\n",
451
- " dropout: ${model.model_defaults.dropout}\n",
452
- " residual: true\n",
453
- " separable: ${model.model_defaults.separable}\n",
454
- " se: ${model.model_defaults.se}\n",
455
- " se_context_size: ${model.model_defaults.se_context_size}\n",
456
- " - filters: 3072\n",
457
- " repeat: 1\n",
458
- " kernel:\n",
459
- " - 1\n",
460
- " stride:\n",
461
- " - 1\n",
462
- " dilation:\n",
463
- " - 1\n",
464
- " dropout: 0.0\n",
465
- " residual: false\n",
466
- " separable: ${model.model_defaults.separable}\n",
467
- " se: ${model.model_defaults.se}\n",
468
- " se_context_size: ${model.model_defaults.se_context_size}\n",
469
- " decoder:\n",
470
- " _target_: nemo.collections.asr.modules.SpeakerDecoder\n",
471
- " feat_in: 3072\n",
472
- " num_classes: ???\n",
473
- " pool_mode: attention\n",
474
- " emb_sizes: 192\n",
475
- " loss:\n",
476
- " _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss\n",
477
- " scale: 30\n",
478
- " margin: 0.2\n",
479
- " optim_param_groups:\n",
480
- " encoder:\n",
481
- " lr: 0.001\n",
482
- " optim:\n",
483
- " name: adamw\n",
484
- " lr: 0.0001\n",
485
- " weight_decay: 0.0002\n",
486
- " sched:\n",
487
- " name: CosineAnnealing\n",
488
- " warmup_ratio: 0.1\n",
489
- " min_lr: 0.0\n",
490
- "trainer:\n",
491
- " devices: 1\n",
492
- " max_epochs: 10\n",
493
- " max_steps: -1\n",
494
- " num_nodes: 1\n",
495
- " accelerator: gpu\n",
496
- " strategy: ddp\n",
497
- " deterministic: true\n",
498
- " enable_checkpointing: false\n",
499
- " logger: false\n",
500
- " log_every_n_steps: 1\n",
501
- " val_check_interval: 1.0\n",
502
- " gradient_clip_val: 1.0\n",
503
- "exp_manager:\n",
504
- " exp_dir: null\n",
505
- " name: TitaNet-Finetune\n",
506
- " create_tensorboard_logger: true\n",
507
- " create_checkpoint_callback: true\n",
508
- "\n"
509
- ]
510
- }
511
- ],
512
- "source": [
513
- "# Set up the config for fine-tuning\n",
514
- "from omegaconf import OmegaConf\n",
515
- "finetune_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n",
516
- "print(OmegaConf.to_yaml(finetune_config))\n"
517
- ]
518
- },
519
- {
520
- "cell_type": "code",
521
- "execution_count": 2,
522
- "metadata": {},
523
- "outputs": [],
524
- "source": [
525
- "# Fine-tune the model with Portuguese language\n",
526
- "\n",
527
- "import torch\n",
528
- "import pytorch_lightning as pl\n",
529
- "import nemo\n",
530
- "import nemo.collections.asr as nemo_asr\n",
531
- "from omegaconf import OmegaConf\n",
532
- "from nemo.utils.exp_manager import exp_manager\n"
533
- ]
534
- },
535
- {
536
- "cell_type": "code",
537
- "execution_count": 4,
538
- "metadata": {},
539
- "outputs": [],
540
- "source": [
541
- "pt_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n",
542
- "## set up the trainer\n",
543
- "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n",
544
- "\n",
545
- "pt_trainer_config = OmegaConf.create(dict(\n",
546
- " devices=4,\n",
547
- " accelerator=accelerator,\n",
548
- " max_epochs=5,\n",
549
- " max_steps=-1, # computed at runtime if not set\n",
550
- " num_nodes=1,\n",
551
- " accumulate_grad_batches=1,\n",
552
- " enable_checkpointing=False, # Provided by exp_manager\n",
553
- " logger=False, # Provided by exp_manager\n",
554
- " log_every_n_steps=1, # Interval of logging.\n",
555
- " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n",
556
- "))\n",
557
- "print(OmegaConf.to_yaml(pt_trainer_config))\n",
558
- "pt_trainer_finetune = pl.Trainer(**pt_trainer_config)"
559
- ]
560
- },
561
- {
562
- "cell_type": "code",
563
- "execution_count": null,
564
- "metadata": {},
565
- "outputs": [],
566
- "source": [
567
- "#set up the nemo experiment for logging and monitoring purpose\n",
568
- "log_dir_finetune = exp_manager(trainer=pt_trainer_finetune, config=pt_config, name='titanet_finetune_pt').get_save_dir()"
569
- ]
570
- },
571
- {
572
- "cell_type": "code",
573
- "execution_count": 8,
574
- "metadata": {},
575
- "outputs": [],
576
- "source": [
577
- "# set up the manifest files for Portuguese language; the number of speaker classes is counted from the train manifest itself\n",
578
- "pt_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/train.json'\n",
579
- "pt_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/dev.json'\n",
580
- "pt_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/pt/test.json'\n",
581
- "pt_config.model.decoder.num_classes = pd.read_json(pt_config.model.train_ds.manifest_filepath, lines=True)['label'].nunique()"
582
- ]
583
- },
584
- {
585
- "cell_type": "code",
586
- "execution_count": null,
587
- "metadata": {},
588
- "outputs": [],
589
- "source": [
590
- "# build the Portuguese speaker model, initialise it from the pretrained checkpoint named in the config, then train\n",
591
- "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=pt_config.model, trainer=pt_trainer_finetune)\n",
592
- "speaker_model.maybe_init_from_pretrained_checkpoint(pt_config)\n",
593
- "# fit with the same trainer instance the model was constructed with\n",
594
- "pt_trainer_finetune.fit(speaker_model)\n"
595
- ]
596
- },
597
- {
598
- "cell_type": "code",
599
- "execution_count": null,
600
- "metadata": {},
601
- "outputs": [],
602
- "source": [
603
- "# Save the model after fine-tuning with Portuguese language\n",
604
- "speaker_model.save_to('titanet_finetune_pt.nemo')"
605
- ]
606
- },
607
- {
608
- "cell_type": "code",
609
- "execution_count": 16,
610
- "metadata": {},
611
- "outputs": [
612
- {
613
- "name": "stdout",
614
- "output_type": "stream",
615
- "text": [
616
- "devices: 1\n",
617
- "accelerator: cpu\n",
618
- "max_epochs: 5\n",
619
- "max_steps: -1\n",
620
- "num_nodes: 1\n",
621
- "accumulate_grad_batches: 1\n",
622
- "enable_checkpointing: false\n",
623
- "logger: false\n",
624
- "log_every_n_steps: 1\n",
625
- "val_check_interval: 1.0\n",
626
- "\n"
627
- ]
628
- },
629
- {
630
- "name": "stderr",
631
- "output_type": "stream",
632
- "text": [
633
- "GPU available: False, used: False\n",
634
- "TPU available: False, using: 0 TPU cores\n",
635
- "IPU available: False, using: 0 IPUs\n",
636
- "HPU available: False, using: 0 HPUs\n",
637
- "`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..\n"
638
- ]
639
- },
640
- {
641
- "name": "stdout",
642
- "output_type": "stream",
643
- "text": [
644
- "[NeMo I 2023-09-25 05:15:08 exp_manager:381] Experiments will be logged at /User/en_tr_pt_titanet_large/nemo_experiments/TitaNet-Finetune/2023-09-25_04-36-46\n",
645
- "[NeMo I 2023-09-25 05:15:08 exp_manager:815] TensorboardLogger has been set up\n",
646
- "[NeMo I 2023-09-25 05:15:08 exp_manager:930] Preemption is supported only on GPUs, disabling preemption\n",
647
- "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
648
- "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n",
649
- "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n"
650
- ]
651
- },
652
- {
653
- "name": "stderr",
654
- "output_type": "stream",
655
- "text": [
656
- "[NeMo W 2023-09-25 05:31:31 label_models:187] Total number of 24 found in all the manifest files.\n"
657
- ]
658
- },
659
- {
660
- "name": "stdout",
661
- "output_type": "stream",
662
- "text": [
663
- "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
664
- "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 31094 items, total duration of 29.37 hours.\n",
665
- "[NeMo I 2023-09-25 05:31:31 collections:304] # 31094 files loaded accounting to # 24 labels\n",
666
- "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
667
- "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10502 items, total duration of 10.23 hours.\n",
668
- "[NeMo I 2023-09-25 05:31:31 collections:304] # 10502 files loaded accounting to # 128 labels\n",
669
- "[NeMo I 2023-09-25 05:31:31 collections:301] Filtered duration for loading collection is 0.00 hours.\n",
670
- "[NeMo I 2023-09-25 05:31:31 collections:302] Dataset loaded with 10880 items, total duration of 12.25 hours.\n",
671
- "[NeMo I 2023-09-25 05:31:31 collections:304] # 10880 files loaded accounting to # 1244 labels\n",
672
- "[NeMo I 2023-09-25 05:31:31 features:289] PADDING: 16\n",
673
- "[NeMo I 2023-09-25 05:31:32 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo\n",
674
- "[NeMo I 2023-09-25 05:31:38 common:913] Instantiating model from pre-trained checkpoint\n"
675
- ]
676
- },
677
- {
678
- "name": "stderr",
679
- "output_type": "stream",
680
- "text": [
681
- "[NeMo W 2023-09-25 05:31:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
682
- " Train config : \n",
683
- " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json\n",
684
- " sample_rate: 16000\n",
685
- " labels: null\n",
686
- " batch_size: 64\n",
687
- " shuffle: true\n",
688
- " is_tarred: false\n",
689
- " tarred_audio_filepaths: null\n",
690
- " tarred_shard_strategy: scatter\n",
691
- " augmentor:\n",
692
- " noise:\n",
693
- " manifest_path: /manifests/noise/rir_noise_manifest.json\n",
694
- " prob: 0.5\n",
695
- " min_snr_db: 0\n",
696
- " max_snr_db: 15\n",
697
- " speed:\n",
698
- " prob: 0.5\n",
699
- " sr: 16000\n",
700
- " resample_type: kaiser_fast\n",
701
- " min_speed_rate: 0.95\n",
702
- " max_speed_rate: 1.05\n",
703
- " num_workers: 15\n",
704
- " pin_memory: true\n",
705
- " \n",
706
- "[NeMo W 2023-09-25 05:31:38 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
707
- " Validation config : \n",
708
- " manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/dev.json\n",
709
- " sample_rate: 16000\n",
710
- " labels: null\n",
711
- " batch_size: 128\n",
712
- " shuffle: false\n",
713
- " num_workers: 15\n",
714
- " pin_memory: true\n",
715
- " \n"
716
- ]
717
- },
718
- {
719
- "name": "stdout",
720
- "output_type": "stream",
721
- "text": [
722
- "[NeMo I 2023-09-25 05:31:38 features:289] PADDING: 16\n",
723
- "[NeMo I 2023-09-25 05:31:39 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /User/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.\n",
724
- "[NeMo I 2023-09-25 05:31:39 modelPT:1151] Model checkpoint partially restored from pretrained checkpoint with name `titanet_large`\n",
725
- "[NeMo I 2023-09-25 05:31:39 modelPT:1153] The following parameters were excluded when loading from pretrained checkpoint with name `titanet_large` : ['decoder.final.weight']\n",
726
- "[NeMo I 2023-09-25 05:31:39 modelPT:1156] Make sure that this is what you wanted!\n",
727
- "[NeMo I 2023-09-25 05:31:39 modelPT:735] Optimizer config = AdamW (\n",
728
- " Parameter Group 0\n",
729
- " amsgrad: False\n",
730
- " betas: (0.9, 0.999)\n",
731
- " capturable: False\n",
732
- " eps: 1e-08\n",
733
- " foreach: None\n",
734
- " lr: 0.0001\n",
735
- " maximize: False\n",
736
- " weight_decay: 0.0002\n",
737
- " \n",
738
- " Parameter Group 1\n",
739
- " amsgrad: False\n",
740
- " betas: (0.9, 0.999)\n",
741
- " capturable: False\n",
742
- " eps: 1e-08\n",
743
- " foreach: None\n",
744
- " lr: 0.001\n",
745
- " maximize: False\n",
746
- " weight_decay: 0.0002\n",
747
- " )\n",
748
- "[NeMo I 2023-09-25 05:31:39 lr_scheduler:910] Scheduler \"<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fc250660ac0>\" \n",
749
- " will be used during training (effective maximum steps = 2430) - \n",
750
- " Parameters : \n",
751
- " (warmup_ratio: 0.1\n",
752
- " min_lr: 0.0\n",
753
- " max_steps: 2430\n",
754
- " )\n"
755
- ]
756
- },
757
- {
758
- "name": "stderr",
759
- "output_type": "stream",
760
- "text": [
761
- "\n",
762
- " | Name | Type | Params\n",
763
- "----------------------------------------------------------------------\n",
764
- "0 | loss | AngularSoftmaxLoss | 0 \n",
765
- "1 | eval_loss | AngularSoftmaxLoss | 0 \n",
766
- "2 | _accuracy | TopKClassificationAccuracy | 0 \n",
767
- "3 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n",
768
- "4 | encoder | ConvASREncoder | 19.4 M\n",
769
- "5 | decoder | SpeakerDecoder | 2.8 M \n",
770
- "6 | _macro_accuracy | MulticlassAccuracy | 0 \n",
771
- "----------------------------------------------------------------------\n",
772
- "22.1 M Trainable params\n",
773
- "0 Non-trainable params\n",
774
- "22.1 M Total params\n",
775
- "88.508 Total estimated model params size (MB)\n"
776
- ]
777
- },
778
- {
779
- "data": {
780
- "application/vnd.jupyter.widget-view+json": {
781
- "model_id": "8a6fa6c7b4214098b48c00a8562b8051",
782
- "version_major": 2,
783
- "version_minor": 0
784
- },
785
- "text/plain": [
786
- "Sanity Checking: 0it [00:00, ?it/s]"
787
- ]
788
- },
789
- "metadata": {},
790
- "output_type": "display_data"
791
- },
792
- {
793
- "name": "stderr",
794
- "output_type": "stream",
795
- "text": [
796
- "[NeMo W 2023-09-25 05:31:39 nemo_logging:349] /User/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:438: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 16 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
797
- " rank_zero_warn(\n",
798
- " \n",
799
- "[NeMo E 2023-09-25 05:31:39 segment:249] Loading /User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav via SoundFile raised RuntimeError: `Error opening '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav': System error.`. NeMo will fallback to loading via pydub.\n"
800
- ]
801
- },
802
- {
803
- "ename": "FileNotFoundError",
804
- "evalue": "[Errno 2] No such file or directory: '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'",
805
- "output_type": "error",
806
- "traceback": [
807
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
808
- "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
809
- "Cell \u001b[0;32mIn[16], line 45\u001b[0m\n\u001b[1;32m 43\u001b[0m speaker_model \u001b[38;5;241m=\u001b[39m nemo_asr\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mEncDecSpeakerLabelModel(cfg\u001b[38;5;241m=\u001b[39mtr_config\u001b[38;5;241m.\u001b[39mmodel, trainer\u001b[38;5;241m=\u001b[39mtr_trainer_finetune)\n\u001b[1;32m 44\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39mmaybe_init_from_pretrained_checkpoint(tr_config)\n\u001b[0;32m---> 45\u001b[0m \u001b[43mtr_trainer_finetune\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mspeaker_model\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;66;03m# Save the model after fine-tuning with Turkish language\u001b[39;00m\n\u001b[1;32m 49\u001b[0m speaker_model\u001b[38;5;241m.\u001b[39msave_to(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtitanet_finetune_tr.nemo\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
810
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:532\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39m_lightning_module \u001b[38;5;241m=\u001b[39m model\n\u001b[1;32m 531\u001b[0m _verify_strategy_supports_compile(model, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy)\n\u001b[0;32m--> 532\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
811
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:43\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 46\u001b[0m _call_teardown_hook(trainer)\n",
812
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:571\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_connector\u001b[38;5;241m.\u001b[39mattach_data(\n\u001b[1;32m 562\u001b[0m model, train_dataloaders\u001b[38;5;241m=\u001b[39mtrain_dataloaders, val_dataloaders\u001b[38;5;241m=\u001b[39mval_dataloaders, datamodule\u001b[38;5;241m=\u001b[39mdatamodule\n\u001b[1;32m 563\u001b[0m )\n\u001b[1;32m 565\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 567\u001b[0m ckpt_path,\n\u001b[1;32m 568\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 569\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 570\u001b[0m )\n\u001b[0;32m--> 571\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 573\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
813
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:980\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_signal_connector\u001b[38;5;241m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 977\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 978\u001b[0m \u001b[38;5;66;03m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 980\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_stage\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 982\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 983\u001b[0m \u001b[38;5;66;03m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 984\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 985\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: trainer tearing down\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
814
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1021\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining:\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m isolate_rng():\n\u001b[0;32m-> 1021\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sanity_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1022\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mautograd\u001b[38;5;241m.\u001b[39mset_detect_anomaly(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_detect_anomaly):\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit_loop\u001b[38;5;241m.\u001b[39mrun()\n",
815
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1050\u001b[0m, in \u001b[0;36mTrainer._run_sanity_check\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1047\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_start\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1049\u001b[0m \u001b[38;5;66;03m# run eval step\u001b[39;00m\n\u001b[0;32m-> 1050\u001b[0m \u001b[43mval_loop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1052\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_end\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n",
816
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:181\u001b[0m, in \u001b[0;36m_no_grad_context.<locals>._decorator\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 179\u001b[0m context_manager \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mno_grad\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context_manager():\n\u001b[0;32m--> 181\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloop_run\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
817
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/evaluation_loop.py:108\u001b[0m, in \u001b[0;36m_EvaluationLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 108\u001b[0m batch, batch_idx, dataloader_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata_fetcher\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_progress\u001b[38;5;241m.\u001b[39mis_last_batch \u001b[38;5;241m=\u001b[39m data_fetcher\u001b[38;5;241m.\u001b[39mdone\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m previous_dataloader_idx \u001b[38;5;241m!=\u001b[39m dataloader_idx:\n\u001b[1;32m 111\u001b[0m \u001b[38;5;66;03m# the dataloader has changed, notify the logger connector\u001b[39;00m\n",
818
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:137\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdone:\n\u001b[1;32m 135\u001b[0m \u001b[38;5;66;03m# this will run only when no pre-fetching was done.\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fetch_next_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataloader_iter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# consume the batch we just fetched\u001b[39;00m\n\u001b[1;32m 139\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatches\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;241m0\u001b[39m)\n",
819
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/loops/fetchers.py:151\u001b[0m, in \u001b[0;36m_PrefetchDataFetcher._fetch_next_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_start_profiler()\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m batch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop_profiler()\n",
820
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:285\u001b[0m, in \u001b[0;36mCombinedLoader.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 285\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_iterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iterator, _Sequential):\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\n",
821
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pytorch_lightning/utilities/combined_loader.py:123\u001b[0m, in \u001b[0;36m_Sequential.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m\n\u001b[1;32m 122\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 123\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterators\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_idx \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
822
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:628\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
823
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/dataloader.py:671\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 670\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 673\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
824
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
825
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
826
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/data/audio_to_label.py:327\u001b[0m, in \u001b[0;36m_AudioLabelDataset.__getitem__\u001b[0;34m(self, index)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m offset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 325\u001b[0m offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 327\u001b[0m features \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeaturizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maudio_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 328\u001b[0m f, fl \u001b[38;5;241m=\u001b[39m features, torch\u001b[38;5;241m.\u001b[39mtensor(features\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mlong()\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_regression_task:\n",
827
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/features.py:186\u001b[0m, in \u001b[0;36mWaveformFeaturizer.process\u001b[0;34m(self, file_path, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess\u001b[39m(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 174\u001b[0m file_path,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 184\u001b[0m normalize_db\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 185\u001b[0m ):\n\u001b[0;32m--> 186\u001b[0m audio \u001b[38;5;241m=\u001b[39m \u001b[43mAudioSegment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample_rate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mint_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mint_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43moffset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moffset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mduration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mduration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtrim_ref\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_ref\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_top_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_top_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_frame_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_frame_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrim_hop_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrim_hop_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43morig_sr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morig_sr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43mchannel_selector\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchannel_selector\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m \u001b[49m\u001b[43mnormalize_db\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnormalize_db\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocess_segment(audio)\n",
828
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/nemo/collections/asr/parts/preprocessing/segment.py:259\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, audio_file, target_sr, int_values, offset, duration, trim, trim_ref, trim_top_db, trim_frame_length, trim_hop_length, orig_sr, channel_selector, normalize_db, ref_channel)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m HAVE_PYDUB \u001b[38;5;129;01mand\u001b[39;00m samples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 259\u001b[0m samples \u001b[38;5;241m=\u001b[39m \u001b[43mAudio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m sample_rate \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mframe_rate\n\u001b[1;32m 261\u001b[0m num_channels \u001b[38;5;241m=\u001b[39m samples\u001b[38;5;241m.\u001b[39mchannels\n",
829
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/audio_segment.py:651\u001b[0m, in \u001b[0;36mAudioSegment.from_file\u001b[0;34m(cls, file, format, codec, parameters, start_second, duration, **kwargs)\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 650\u001b[0m filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 651\u001b[0m file, close_file \u001b[38;5;241m=\u001b[39m \u001b[43m_fd_or_path_or_tempfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtempfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mformat\u001b[39m:\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28mformat\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m.\u001b[39mlower()\n",
830
- "File \u001b[0;32m~/.conda/envs/transcribe/lib/python3.9/site-packages/pydub/utils.py:60\u001b[0m, in \u001b[0;36m_fd_or_path_or_tempfile\u001b[0;34m(fd, mode, tempfile)\u001b[0m\n\u001b[1;32m 57\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(fd, basestring):\n\u001b[0;32m---> 60\u001b[0m fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m close_fd \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
831
- "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/User/en_tr_pt_titanet_large/data/cv-corpus-15.0-2023-09-08/tr/clips/common_voice_tr_26644120.wav'"
832
- ]
833
- }
834
- ],
835
- "source": [
836
- "# Fine-tune the model with Turkish language\n",
837
- "\n",
838
- "import torch\n",
839
- "import pytorch_lightning as pl\n",
840
- "import nemo\n",
841
- "import nemo.collections.asr as nemo_asr\n",
842
- "from omegaconf import OmegaConf\n",
843
- "from nemo.utils.exp_manager import exp_manager\n",
844
- "\n",
845
- "# Fine-tune the model with Turkish language\n",
846
- "tr_config = OmegaConf.load(\"conf/titanet-finetune.yaml\")\n",
847
- "## set up the trainer\n",
848
- "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n",
849
- "\n",
850
- "tr_trainer_config = OmegaConf.create(dict(\n",
851
- " devices=4,\n",
852
- " accelerator=accelerator,\n",
853
- " max_epochs=5,\n",
854
- " max_steps=-1, # computed at runtime if not set\n",
855
- " num_nodes=1,\n",
856
- " accumulate_grad_batches=1,\n",
857
- " enable_checkpointing=False, # Provided by exp_manager\n",
858
- " logger=False, # Provided by exp_manager\n",
859
- " log_every_n_steps=1, # Interval of logging.\n",
860
- " val_check_interval=1.0, # Set to 0.25 to check 4 times per epoch, or an int for number of iterations\n",
861
- "))\n",
862
- "print(OmegaConf.to_yaml(tr_trainer_config))\n",
863
- "tr_trainer_finetune = pl.Trainer(**tr_trainer_config)\n",
864
- "\n",
865
- "\n",
866
- "# Set up the NeMo experiment manager for logging and monitoring purposes\n",
867
- "log_dir_finetune = exp_manager(tr_trainer_finetune, tr_config.get(\"exp_manager\", None))\n",
868
- "\n",
869
- "\n",
870
- "# set up the manifest file for Turkish language\n",
871
- "tr_config.model.train_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/train.json'\n",
872
- "tr_config.model.validation_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/dev.json'\n",
873
- "tr_config.model.test_ds.manifest_filepath = 'data/cv-corpus-15.0-2023-09-08/tr/test.json'\n",
874
- "tr_config.model.decoder.num_classes = merged_tr_train_df['label'].nunique()\n",
875
- "\n",
876
- "\n",
877
- "# set up the model for Turkish language and train the model\n",
878
- "speaker_model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=tr_config.model, trainer=tr_trainer_finetune)\n",
879
- "speaker_model.maybe_init_from_pretrained_checkpoint(tr_config)\n",
880
- "tr_trainer_finetune.fit(speaker_model)\n",
881
- "\n",
882
- "# Save the model after fine-tuning with Turkish language\n",
883
- "\n",
884
- "speaker_model.save_to('titanet_finetune_tr.nemo')"
885
- ]
886
- },
887
- {
888
- "cell_type": "code",
889
- "execution_count": null,
890
- "metadata": {},
891
- "outputs": [],
892
- "source": []
893
- }
894
- ],
895
- "metadata": {
896
- "accelerator": "GPU",
897
- "colab": {
898
- "collapsed_sections": [],
899
- "name": "Speaker_Recogniton_Verification.ipynb",
900
- "provenance": [],
901
- "toc_visible": true
902
- },
903
- "kernelspec": {
904
- "display_name": "transcribe",
905
- "language": "python",
906
- "name": "conda-env-.conda-transcribe-py"
907
- },
908
- "language_info": {
909
- "codemirror_mode": {
910
- "name": "ipython",
911
- "version": 3
912
- },
913
- "file_extension": ".py",
914
- "mimetype": "text/x-python",
915
- "name": "python",
916
- "nbconvert_exporter": "python",
917
- "pygments_lexer": "ipython3",
918
- "version": "3.9.16"
919
- }
920
- },
921
- "nbformat": 4,
922
- "nbformat_minor": 4
923
- }