diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/49epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/49epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..d35a909424dcc9377dcd1380202ac4b385da376c --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/49epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13942d8df6c2326cc0f4f5df1745ba9a6ee792b54ca65b85c322e9ebdad81439 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd395af2fd896d2df2dddb8ce18833e7644b765 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a034944163cbf78cb911dd3e9e7e5561b8516429ce774c7669808a12a8f459b +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/53epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/53epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..97edf0e11183002fba02e2acc6b8a7503f7eeb32 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/53epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4a2014dacb245d48ec2e038cb352ff0902f18bc0156ce2b94eb1d321f2a5ab +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..547cd258f4d14f13719b34e633a520dda8a5d661 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50c499c23cef8489938aeb927367d479b55b109e852fde28fa17a14af64a8a76 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7cd2c1599ecd9af30c8543dc0d7412454d279ed --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2930d5d0bb9b0029c8906d01748174b5663905ab969ee2137f019f2a72a02c7c +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/56epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/56epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4495cd37187af88b5a6e71200a26839623e935c --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/56epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d295b116b9891ef8520e7987d0682f79636305c96362e7067cfa63096f9e2e2e +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..13d6c20138f7e2a02c644303eca9e2a37b36158c --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec090b65d90f017b160f5718d059cbbec01424491ae31ddb62e16255f64cc80a +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..66ec4bba50ebfdc1c67f2c21d88e5c707fa5a8aa --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d4fb13b7ec8fcbc0e79b48c2f2e3ca03db84a62c6aecb96db6ec70d992daee +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..147771fda88d4e5473bfe6ea2ada0b5771f2d8a1 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63fced0fdc8817cd4f376fe8e33ffc3b07a8615ce2151f6b5bd6be0990bb75ff +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..d28d5bd6d2e3430a2a9082ff03bddbac84e58642 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab99d72231e8b471e6f66d902a88c36315286ecfc4bb77cf1efcd73bfe200db +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..170157f409d67bcd91be3d8f33518794102d04ba --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md @@ -0,0 +1,196 @@ + +# RESULTS +## Environments +- date: `Tue Mar 5 09:47:19 CST 2024` +- python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]` +- espnet version: `espnet 202308` +- pytorch version: `pytorch 1.12.1+cu116` +- Git hash: `884659f9ee95374811015381c976fa3b4f6e01db` + - Commit date: `Thu Nov 23 00:23:29 2023 +0800` + +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_30epoch/test_clean_kaldi_fmt|961|64007|80.7|15.1|4.2|6.7|26.0|98.9| +|decode_sot_asr_model_30epoch/test_other_kaldi_fmt|992|80370|75.0|19.6|5.4|7.8|32.9|99.6| +|decode_sot_asr_model_35epoch/test_clean_kaldi_fmt|961|64007|81.5|14.5|4.0|7.4|26.0|98.6| +|decode_sot_asr_model_35epoch/test_other_kaldi_fmt|992|80370|75.7|18.8|5.5|9.0|33.2|99.4| +|decode_sot_asr_model_40epoch/test_clean_kaldi_fmt|961|64007|80.4|14.2|5.4|4.8|24.4|97.7| +|decode_sot_asr_model_40epoch/test_other_kaldi_fmt|992|80370|74.8|18.1|7.1|5.7|30.9|99.5| +|decode_sot_asr_model_8epoch/test_clean_kaldi_fmt|961|64007|70.4|23.7|5.8|7.2|36.7|99.1| +|decode_sot_asr_model_8epoch/test_other_kaldi_fmt|992|80370|63.7|28.7|7.5|8.6|44.8|99.6| +|decode_sot_asr_model_valid.acc.best/dev|3000|126853|54.3|32.2|13.4|28.9|74.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|226216|76.8|11.5|11.6|12.4|35.5|99.2| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk1|1606|135101|36.6|21.7|41.7|3.2|66.7|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk2|1606|135101|35.5|21.5|43.0|3.4|67.9|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|209679|63.9|20.3|15.8|10.4|46.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk_kaldi_fmt|1004|124462|67.6|17.1|15.2|7.2|39.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|200029|52.0|27.4|20.7|11.8|59.9|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk_kaldi_fmt|721|119166|55.5|22.9|21.6|11.3|55.8|100.0| +|decode_sot_asr_model_valid.acc.best/dev_oracle|544|10798|85.9|12.2|1.9|88.7|102.9|92.8| +|decode_sot_asr_model_valid.acc.best/eval_oracle|4479|96585|84.7|13.0|2.4|88.2|103.6|94.5| +|decode_sot_asr_model_valid.acc.best/sot_sdm1_dev|2382|35243|0.0|0.0|100.0|0.0|100.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|301042|77.5|10.9|11.6|11.3|33.8|99.5| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|212871|64.4|19.3|16.3|11.8|47.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|185394|53.2|26.2|20.6|10.6|57.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|336490|75.9|13.3|10.8|11.6|35.6|99.9| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|266074|60.5|23.6|15.9|11.5|51.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|259138|49.2|30.2|20.6|11.1|61.9|100.0| +|decode_sot_asr_model_valid.acc.best/test|3000|114243|55.6|30.7|13.7|32.9|77.3|99.9| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|178761|80.2|9.0|10.8|7.5|27.2|98.9| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk1|2180|178761|33.1|21.4|45.4|3.0|69.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk2|2180|178761|31.6|24.4|44.0|3.0|71.4|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_3spk_kaldi_fmt|977|124741|66.8|17.5|15.7|9.0|42.2|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_4spk_kaldi_fmt|632|109072|56.3|22.4|21.3|8.8|52.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_kaldi_fmt|961|64007|81.4|13.3|5.3|5.9|24.5|97.5| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|205496|78.6|11.7|9.6|7.8|29.1|99.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk1|2363|205496|28.2|26.2|45.6|3.1|74.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk2|2363|205496|35.6|22.3|42.1|3.1|67.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_3spk_kaldi_fmt|1246|162996|62.5|21.9|15.6|9.1|46.6|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_4spk_kaldi_fmt|901|157123|51.9|26.0|22.1|10.4|58.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_kaldi_fmt|992|80370|75.5|17.6|6.9|6.8|31.3|99.3| +|decode_sot_css_asr_model_valid.acc.best/dev_oracle|544|10798|85.0|12.9|2.1|48.0|63.0|92.8| +|decode_sot_css_asr_model_valid.acc.best/eval_oracle|4479|96585|84.1|13.2|2.7|49.1|65.0|94.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_30epoch/test_clean_kaldi_fmt|961|329390|89.3|6.0|4.6|6.7|17.4|98.9| +|decode_sot_asr_model_30epoch/test_other_kaldi_fmt|992|416899|85.6|8.2|6.3|7.7|22.2|99.6| +|decode_sot_asr_model_35epoch/test_clean_kaldi_fmt|961|329390|89.8|5.8|4.4|7.4|17.6|98.6| +|decode_sot_asr_model_35epoch/test_other_kaldi_fmt|992|416899|85.9|8.0|6.1|8.5|22.6|99.4| +|decode_sot_asr_model_40epoch/test_clean_kaldi_fmt|961|329390|88.6|5.9|5.5|4.5|16.0|97.7| +|decode_sot_asr_model_40epoch/test_other_kaldi_fmt|992|416899|84.5|8.0|7.5|5.2|20.7|99.5| +|decode_sot_asr_model_8epoch/test_clean_kaldi_fmt|961|329390|83.8|9.3|6.9|7.2|23.5|99.1| +|decode_sot_asr_model_8epoch/test_other_kaldi_fmt|992|416899|78.7|12.1|9.2|8.2|29.5|99.6| +|decode_sot_asr_model_valid.acc.best/dev|3000|673222|71.1|13.3|15.6|28.5|57.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|1230801|83.7|6.2|10.1|10.8|27.2|99.2| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk1|1606|735694|47.9|9.1|43.0|3.6|55.7|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk2|1606|735694|47.1|9.1|43.8|3.7|56.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|1140428|74.6|10.3|15.2|9.3|34.7|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk_kaldi_fmt|1004|677017|76.8|8.2|15.0|6.2|29.4|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|1087409|65.5|13.1|21.3|10.3|44.7|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk_kaldi_fmt|721|647884|67.3|10.5|22.2|8.9|41.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_oracle|544|57590|94.8|3.2|2.0|86.2|91.5|92.8| +|decode_sot_asr_model_valid.acc.best/eval_oracle|4479|522239|93.8|3.6|2.6|85.2|91.4|94.5| +|decode_sot_asr_model_valid.acc.best/sot_sdm1_dev|2382|169857|0.0|0.0|100.0|0.0|100.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|1550429|84.0|6.1|10.0|9.9|25.9|99.5| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|1084475|74.6|10.4|15.1|10.2|35.7|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|938467|66.3|13.1|20.7|9.9|43.6|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|1742136|83.5|7.0|9.4|10.0|26.5|99.9| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|1381987|72.6|12.0|15.4|10.0|37.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|1346646|63.9|14.4|21.7|9.9|46.0|100.0| +|decode_sot_asr_model_valid.acc.best/test|3000|608408|71.7|12.5|15.8|32.4|60.7|99.9| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|921344|85.5|5.0|9.5|6.1|20.6|98.9| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk1|2180|921344|44.6|8.9|46.5|3.5|58.8|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk2|2180|921344|44.9|10.2|44.9|3.7|58.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_3spk_kaldi_fmt|977|635802|76.0|9.0|15.0|7.7|31.7|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_4spk_kaldi_fmt|632|552325|67.8|10.6|21.6|7.8|39.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_kaldi_fmt|961|329390|89.2|5.3|5.4|5.8|16.6|97.5| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|1064868|85.3|6.1|8.6|6.4|21.1|99.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk1|2363|1064868|42.2|11.1|46.7|3.8|61.6|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk2|2363|1064868|47.8|9.2|43.0|3.7|55.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_3spk_kaldi_fmt|1246|847159|73.8|10.5|15.6|7.9|34.1|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_4spk_kaldi_fmt|901|817228|65.0|11.9|23.1|8.7|43.7|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_kaldi_fmt|992|416899|85.1|7.3|7.6|6.8|21.7|99.3| +|decode_sot_css_asr_model_valid.acc.best/dev_oracle|544|57590|93.7|4.0|2.4|46.5|52.8|92.8| +|decode_sot_css_asr_model_valid.acc.best/eval_oracle|4479|522239|92.9|4.1|3.0|47.2|54.3|94.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_30epoch +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|47659|78.4|16.5|5.1|5.5|27.1|97.2| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|258151|87.5|6.5|6.1|5.6|18.1|97.2| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_35epoch +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|47659|79.4|16.2|4.4|6.8|27.4|98.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|258151|88.2|6.5|5.3|6.7|18.5|98.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_40epoch +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|47659|78.5|15.6|5.8|4.0|25.5|97.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|258151|87.0|6.3|6.7|3.9|16.9|97.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_8epoch +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|47659|68.4|24.9|6.7|6.7|38.2|98.3| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_kaldi_fmt|605|258151|81.6|9.9|8.5|6.7|25.2|98.3| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_valid.acc.best +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_2spk_kaldi_fmt|1606|135101|79.1|9.8|11.1|8.4|29.2|98.4| +|org/dev_kaldi_fmt|605|47659|78.8|15.0|6.2|5.7|26.9|97.5| +|org/sot_sdm1_eval|2385|37529|33.4|53.9|12.7|102.0|168.6|100.0| +|org/tt_mix_clean_reverb_max_16k|3000|3000|0.0|100.0|0.0|4047.2|4147.2|100.0| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_2spk_kaldi_fmt|1606|735694|84.8|5.3|9.9|6.9|22.1|98.4| +|org/dev_kaldi_fmt|605|258151|87.1|6.0|6.9|5.7|18.6|97.5| +|org/sot_sdm1_eval|2385|183036|62.5|22.7|14.8|97.8|135.2|100.0| +|org/tt_mix_clean_reverb_max_16k|3000|143026|17.3|82.6|0.1|411.1|493.8|100.0| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..64795a1a6ec1a8d46b29c5a8ae4ca57287c15c64 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14eb244b2d0dfc6d2dc23b8b9a10e1814df51f163d680c62439ff31374e6bd2e +size 516820547 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..976b79e692f0867b7bcbcb47e361a04dac242651 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml @@ -0,0 +1,227 @@ +config: conf/tuning/train_sot_asr_conformer_medium.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new +ngpu: 1 +seed: 0 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: 4 +dist_rank: 0 +local_rank: 0 +dist_master_addr: localhost +dist_master_port: 44319 +dist_launcher: null +multiprocessing_distributed: true +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: false +write_collected_feats: false +max_epoch: 60 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: +- /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 16000000 +valid_batch_bins: null +train_shape_file: +- exp/asr_stats_raw_en_char_sp/train/speech_shape +- exp/asr_stats_raw_en_char_sp/train/text_shape.char +valid_shape_file: +- exp/asr_stats_raw_en_char_sp/valid/speech_shape +- exp/asr_stats_raw_en_char_sp/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train_medium_kaldi_fmt_sp/wav.scp + - speech + - kaldi_ark +- - dump/raw/train_medium_kaldi_fmt_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/raw/dev_kaldi_fmt/wav.scp + - speech + - kaldi_ark +- - dump/raw/dev_kaldi_fmt/text + - text + - text +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 20000 +token_list: +- +- +- +- +- E +- T +- A +- O +- N +- I +- H +- S +- R +- D +- L +- U +- M +- C +- W +- F +- G +- Y +- P +- B +- V +- K +- '''' +- X +- J +- Q +- Z +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true +joint_net_conf: null +use_preprocessor: true +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + fs: 16k +specaug: null +specaug_conf: {} +normalize: global_mvn +normalize_conf: + stats_file: exp/asr_stats_raw_en_char_sp/train/feats_stats.npz +model: espnet +model_conf: + ctc_weight: 0.0 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: conformer +encoder_conf: + output_size: 256 + attention_heads: 4 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 +preprocessor: multi +preprocessor_conf: + speaker_change_symbol: + - +required: +- output_dir +- token_list +version: '202308' +distributed: true diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..6840eee97dc6e9b28823cec6665d1a342d1e973b Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..37a1ced158766efa81991447a9f7798399395078 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..c8a2dc91037c65d4a07d3f085820315ba8a918dc Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..64d86128414468d9249e125df0cc43c061a603ba Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..d856007dcf757ca6af12ac18920e07fed2fa8011 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..9eda0adfa6a34b26efc82b0e910c7570ba4b62de Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..fd15b0ea7f135031b8efc80ccc9cb3fd91f6c90b Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..81b0ff7ffa3a2ea6e0b8b851ece8f6b2c8ffb2ea Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c84410a6dbc3542f75cce869fd1dc7c1bffeae39 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..fb100ac113b5e1b6ad10054350290fada2fe17f3 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..e33f9c0cddeb0df980ab8788ea9c515b564c7dcb Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..252d7d1f2afe53b431c36f9aca23555f612d67f3 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..c52e4a47fb7e93ae7158201365a1f94c235db1e3 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..af37a6766e4857c135b2b01a74211a9cc41668ac Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..7b0208e69be916c75e573db4fecb1c5376f5e400 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth new file mode 100644 index 0000000000000000000000000000000000000000..d28d5bd6d2e3430a2a9082ff03bddbac84e58642 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab99d72231e8b471e6f66d902a88c36315286ecfc4bb77cf1efcd73bfe200db +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..91f2d5c7f8a22dc58122284218fb9863f50e083c --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh @@ -0,0 +1 @@ +./asr.sh --lang en --audio_format flac.ark --stage 12 --feats_type raw --token_type char --sot_asr true --max_wav_duration 50 --speed_perturb_factors '0.9 1.0 1.1' --feats_normalize global_mvn --use_lm false --pretrained_model /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --asr_config conf/tuning/train_sot_asr_conformer_medium.yaml --lm_config conf/tuning/train_lm_transformer.yaml --inference_config conf/tuning/decode_sot.yaml --train_set train_medium_kaldi_fmt --valid_set dev_kaldi_fmt --test_sets 'dev_kaldi_fmt test_clean_kaldi_fmt test_other_kaldi_fmt' --ngpu 4 --asr_tag train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --lm_train_text data/local/other_text/text --bpe_train_text data/train_medium_kaldi_fmt/text --stage 12 "$@"; exit $? diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.0 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.0 new file mode 100644 index 0000000000000000000000000000000000000000..c94de23601d379c519af237e10a55b05c2495079 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30cf181e3c82c4cc12d7aebfb77b819aa2da1fd08a2d567519ad1b5196e3171 +size 88 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.0 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.0 new file mode 100644 index 0000000000000000000000000000000000000000..52d99cae8a983fa9c89e7cd537f2c48a84dfd5a0 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0848df7dac3f565c375a3f38dc7a661fd41628f61bffb3d75e70e3641706faec +size 107387014 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.0 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.0 new file mode 100644 index 0000000000000000000000000000000000000000..7a78ff05850cac7c903e633fcbebca17cdd72141 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c40b67a8b2a99f63326ec1b7b8685ca88cc3ed776a10e39c5273e164c8982b67 +size 8306 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.0 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.0 new file mode 100644 index 0000000000000000000000000000000000000000..fdf9e875587e26a9b15b19facf4030248d002575 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08432dbcf98ebaad27388a1b90b781f1aef124b1ac2c3c4676f8a387c81c0c1a +size 743299213 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.1 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.1 new file mode 100644 index 0000000000000000000000000000000000000000..30372d139d39d338aa146d8ecc926370f4d5e46e --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700307214.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1753232.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05d778de3f4cc8bdfe77155d9f1e2d6782358f94fdcf930fd16cb4b391e0f81f +size 88 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.1 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.1 new file mode 100644 index 0000000000000000000000000000000000000000..1866a8044b2f959fc8fee768aee8281ba7c54a51 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700310367.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2045481.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ead35607a81bfff4e5bab9f6dd496480516f7350927ad98a7a4db115507b2ee +size 2338 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.1 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.1 new file mode 100644 index 0000000000000000000000000000000000000000..cb0b83de916a45234c6263eb960640c278deebf9 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700412292.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.1823743.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e422f849535779325246177b90cb5193453b58ac6cceb2e15640105036c361a8 +size 88 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.1 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.1 new file mode 100644 index 0000000000000000000000000000000000000000..429ae0e6b9f19e2cb98659fc6fe32c3b654b4e2f --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1700536345.de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb.2992215.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:143371316cfd5edd14fe5c0cc235d84980859e4dd329af2b61fcdd64eb7b93fb +size 14752 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log new file mode 100644 index 0000000000000000000000000000000000000000..c1688536aa6786d0cd7cabfb142d6fbee1bf3fd7 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log @@ -0,0 +1,773 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Mon Nov 20 00:43:12 CST 2023 +# +/star-home/jinzengrui/lib/miniconda3/envs/dev39/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:23,119 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:23,119 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:23,174 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,610 (abs_task:1229) INFO: pytorch.version=1.11.0+cu102, cuda.available=True, cudnn.version=7605, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,624 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,624 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + eps: 1e-08 + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,624 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,626 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:31,692 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:35,293 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,615 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_medium_kaldi_fmt_sp/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_medium_kaldi_fmt_sp/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,616 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=10615, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,620 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=10615, mean=53.4, min=7, max=201 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,737 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,747 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,747 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=12, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,747 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=12, mean=50.4, min=17, max=82 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,752 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,775 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,775 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=605, batch_size=1, key_file=exp/asr_stats_raw_en_char_sp/valid/speech_shape, +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:44,775 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:48,183 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda10.2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823744 [1] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823745 [2] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823746 [3] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823744 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823745 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823746 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823744 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823744 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823744 [1] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823746 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823746 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823746 [3] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823745 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823745 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823745 [2] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Setting affinity for GPU 3 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Setting affinity for GPU 2 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Setting affinity for GPU 1 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Setting affinity for GPU 0 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Channel 00 : 3[8000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Channel 00 : 2[7000] -> 3[8000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Channel 01 : 2[7000] -> 3[8000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Channel 00 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Channel 01 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Channel 00 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Channel 01 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Channel 01 : 3[8000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Channel 00 : 3[8000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Channel 01 : 3[8000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Channel 00 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Channel 00 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Channel 01 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Channel 01 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823746:1823875 [3] NCCL INFO comm 0x7f4294001200 rank 3 nranks 4 cudaDev 3 busId 8000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823870 [0] NCCL INFO comm 0x7f450c001200 rank 0 nranks 4 cudaDev 0 busId 4000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823745:1823876 [2] NCCL INFO comm 0x7f631c001200 rank 2 nranks 4 cudaDev 2 busId 7000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823744:1823874 [1] NCCL INFO comm 0x7fa7ac001200 rank 1 nranks 4 cudaDev 1 busId 6000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1823743:1823743 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:44:52,106 (trainer:284) INFO: 9/60epoch started +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:49:07,241 (distributed:948) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 00:55:25,795 (trainer:732) INFO: 9epoch:train:1-530batch: iter_time=0.002, forward_time=0.209, loss_att=107.878, acc=0.904, loss=107.878, backward_time=0.325, grad_norm=103.272, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=4.789 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:01:45,507 (trainer:732) INFO: 9epoch:train:531-1060batch: iter_time=2.446e-04, forward_time=0.209, loss_att=112.672, acc=0.903, loss=112.672, backward_time=0.327, grad_norm=105.518, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:08:04,727 (trainer:732) INFO: 9epoch:train:1061-1590batch: iter_time=2.303e-04, forward_time=0.208, loss_att=109.250, acc=0.905, loss=109.250, backward_time=0.327, grad_norm=112.951, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:14:24,051 (trainer:732) INFO: 9epoch:train:1591-2120batch: iter_time=3.562e-04, forward_time=0.211, loss_att=109.274, acc=0.906, loss=109.274, backward_time=0.327, grad_norm=104.180, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:20:44,930 (trainer:732) INFO: 9epoch:train:2121-2650batch: iter_time=3.394e-04, forward_time=0.211, loss_att=110.221, acc=0.905, loss=110.221, backward_time=0.328, grad_norm=110.200, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:27:04,841 (trainer:732) INFO: 9epoch:train:2651-3180batch: iter_time=3.402e-04, forward_time=0.211, loss_att=111.572, acc=0.905, loss=111.572, backward_time=0.328, grad_norm=102.797, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:33:23,833 (trainer:732) INFO: 9epoch:train:3181-3710batch: iter_time=3.449e-04, forward_time=0.210, loss_att=108.396, acc=0.906, loss=108.396, backward_time=0.327, grad_norm=103.265, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:39:42,244 (trainer:732) INFO: 9epoch:train:3711-4240batch: iter_time=3.845e-04, forward_time=0.210, loss_att=105.457, acc=0.907, loss=105.457, backward_time=0.326, grad_norm=105.516, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:46:01,633 (trainer:732) INFO: 9epoch:train:4241-4770batch: iter_time=3.412e-04, forward_time=0.211, loss_att=110.350, acc=0.906, loss=110.350, backward_time=0.327, grad_norm=104.195, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.002, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:52:20,391 (trainer:732) INFO: 9epoch:train:4771-5300batch: iter_time=3.219e-04, forward_time=0.210, loss_att=106.497, acc=0.907, loss=106.497, backward_time=0.327, grad_norm=105.330, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 01:58:41,240 (trainer:732) INFO: 9epoch:train:5301-5830batch: iter_time=3.283e-04, forward_time=0.211, loss_att=108.151, acc=0.909, loss=108.151, backward_time=0.328, grad_norm=100.440, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 02:05:01,053 (trainer:732) INFO: 9epoch:train:5831-6360batch: iter_time=3.129e-04, forward_time=0.210, loss_att=107.337, acc=0.908, loss=107.337, backward_time=0.328, grad_norm=105.336, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.002, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 02:11:21,060 (trainer:732) INFO: 9epoch:train:6361-6890batch: iter_time=3.236e-04, forward_time=0.210, loss_att=106.133, acc=0.909, loss=106.133, backward_time=0.328, grad_norm=104.171, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-20 02:17:40,657 (trainer:732) INFO: 9epoch:train:6891-7420batch: iter_time=3.511e-04, forward_time=0.210, loss_att=107.243, acc=0.908, loss=107.243, backward_time=0.327, grad_norm=108.454, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.863 +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGKILL +# Accounting: time=5952 threads=1 +# Ended (code 1) at Mon Nov 20 02:22:24 CST 2023, elapsed time 5952 seconds +/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 224 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.2.log b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.2.log new file mode 100644 index 0000000000000000000000000000000000000000..9f77154217e574e89cd40eb403871a9cac08a3d5 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.2.log @@ -0,0 +1,1800 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Sat Nov 18 20:24:31 CST 2023 +# +/star-home/jinzengrui/lib/miniconda3/envs/dev39/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:34787 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:34787 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:34787 (errno: 99 - Cannot assign requested address). +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:40,791 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:40,792 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:40,859 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,463 (abs_task:1229) INFO: pytorch.version=1.11.0+cu102, cuda.available=True, cudnn.version=7605, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,478 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,478 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + eps: 1e-08 + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,478 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,480 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:49,505 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:25:53,304 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:03,944 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_medium_kaldi_fmt_sp/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_medium_kaldi_fmt_sp/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:03,944 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=10615, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:03,949 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=10615, mean=53.4, min=7, max=201 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,095 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,104 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,104 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=12, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,104 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=12, mean=50.4, min=17, max=82 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,109 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,132 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,132 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=605, batch_size=1, key_file=exp/asr_stats_raw_en_char_sp/valid/speech_shape, +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:04,132 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda10.2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2045482 [1] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2045483 [2] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2045483 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2045482 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2045484 [3] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2045484 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2045483 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2045483 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2045483 [2] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2045482 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2045482 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2045482 [1] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2045484 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2045484 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2045484 [3] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Setting affinity for GPU 1 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Setting affinity for GPU 2 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Setting affinity for GPU 5 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Setting affinity for GPU 0 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Channel 00 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Channel 00 : 2[7000] -> 3[e000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Channel 01 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Channel 01 : 2[7000] -> 3[e000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Channel 00 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Channel 01 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Channel 00 : 3[e000] -> 0[4000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Channel 01 : 3[e000] -> 0[4000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Channel 00 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Channel 01 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Channel 00 : 3[e000] -> 2[7000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Channel 01 : 3[e000] -> 2[7000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Channel 00 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Channel 01 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049222 [1] NCCL INFO comm 0x7f3bc8001200 rank 1 nranks 4 cudaDev 1 busId 6000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049223 [3] NCCL INFO comm 0x7fbce0001200 rank 3 nranks 4 cudaDev 3 busId e000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049178 [0] NCCL INFO comm 0x7f76b4001200 rank 0 nranks 4 cudaDev 0 busId 4000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2045481 [0] NCCL INFO Launch mode Parallel +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049221 [2] NCCL INFO comm 0x7f9a60001200 rank 2 nranks 4 cudaDev 2 busId 7000 - Init COMPLETE +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:26:07,879 (trainer:284) INFO: 1/60epoch started +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:30:00,279 (distributed:948) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:36:35,905 (trainer:732) INFO: 1epoch:train:1-530batch: iter_time=0.004, forward_time=0.227, loss_att=840.974, acc=0.440, loss=840.974, backward_time=0.332, grad_norm=591.509, clip=100.000, loss_scale=1.000, optim_step_time=0.128, optim0_lr0=6.750e-06, train_time=4.745 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:43:08,839 (trainer:732) INFO: 1epoch:train:531-1060batch: iter_time=3.654e-04, forward_time=0.221, loss_att=563.079, acc=0.540, loss=563.079, backward_time=0.330, grad_norm=162.286, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=2.000e-05, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:49:41,581 (trainer:732) INFO: 1epoch:train:1061-1590batch: iter_time=3.514e-04, forward_time=0.220, loss_att=510.041, acc=0.581, loss=510.041, backward_time=0.330, grad_norm=114.157, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=3.325e-05, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 20:56:13,955 (trainer:732) INFO: 1epoch:train:1591-2120batch: iter_time=3.647e-04, forward_time=0.219, loss_att=480.065, acc=0.603, loss=480.065, backward_time=0.330, grad_norm=84.948, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=4.650e-05, train_time=2.960 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:02:46,755 (trainer:732) INFO: 1epoch:train:2121-2650batch: iter_time=3.485e-04, forward_time=0.217, loss_att=466.668, acc=0.617, loss=466.668, backward_time=0.330, grad_norm=80.977, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=5.975e-05, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:09:14,878 (trainer:732) INFO: 1epoch:train:2651-3180batch: iter_time=3.327e-04, forward_time=0.214, loss_att=443.970, acc=0.628, loss=443.970, backward_time=0.329, grad_norm=72.737, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=7.300e-05, train_time=2.927 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:15:45,728 (trainer:732) INFO: 1epoch:train:3181-3710batch: iter_time=3.577e-04, forward_time=0.215, loss_att=436.701, acc=0.640, loss=436.701, backward_time=0.330, grad_norm=75.238, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=8.625e-05, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:22:15,120 (trainer:732) INFO: 1epoch:train:3711-4240batch: iter_time=3.319e-04, forward_time=0.214, loss_att=421.203, acc=0.646, loss=421.203, backward_time=0.329, grad_norm=71.518, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=9.950e-05, train_time=2.937 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:28:45,267 (trainer:732) INFO: 1epoch:train:4241-4770batch: iter_time=3.443e-04, forward_time=0.214, loss_att=415.274, acc=0.656, loss=415.274, backward_time=0.330, grad_norm=77.310, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=1.127e-04, train_time=2.945 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:35:13,845 (trainer:732) INFO: 1epoch:train:4771-5300batch: iter_time=3.489e-04, forward_time=0.213, loss_att=400.443, acc=0.662, loss=400.443, backward_time=0.329, grad_norm=76.588, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=1.260e-04, train_time=2.931 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:41:44,041 (trainer:732) INFO: 1epoch:train:5301-5830batch: iter_time=3.442e-04, forward_time=0.214, loss_att=395.009, acc=0.667, loss=395.009, backward_time=0.330, grad_norm=76.219, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=1.392e-04, train_time=2.946 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:48:13,420 (trainer:732) INFO: 1epoch:train:5831-6360batch: iter_time=3.827e-04, forward_time=0.214, loss_att=391.748, acc=0.674, loss=391.748, backward_time=0.331, grad_norm=77.097, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=1.525e-04, train_time=2.937 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 21:54:40,619 (trainer:732) INFO: 1epoch:train:6361-6890batch: iter_time=3.671e-04, forward_time=0.212, loss_att=374.046, acc=0.679, loss=374.046, backward_time=0.329, grad_norm=79.413, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=1.657e-04, train_time=2.923 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:01:10,142 (trainer:732) INFO: 1epoch:train:6891-7420batch: iter_time=3.683e-04, forward_time=0.213, loss_att=384.333, acc=0.686, loss=384.333, backward_time=0.331, grad_norm=83.976, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=1.790e-04, train_time=2.938 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:07:36,692 (trainer:732) INFO: 1epoch:train:7421-7950batch: iter_time=3.666e-04, forward_time=0.212, loss_att=363.886, acc=0.690, loss=363.886, backward_time=0.329, grad_norm=78.427, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=1.922e-04, train_time=2.918 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:14:04,028 (trainer:732) INFO: 1epoch:train:7951-8480batch: iter_time=3.698e-04, forward_time=0.213, loss_att=354.762, acc=0.695, loss=354.762, backward_time=0.328, grad_norm=80.348, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=2.055e-04, train_time=2.922 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:20:33,599 (trainer:732) INFO: 1epoch:train:8481-9010batch: iter_time=3.703e-04, forward_time=0.214, loss_att=357.177, acc=0.701, loss=357.177, backward_time=0.331, grad_norm=85.728, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.187e-04, train_time=2.941 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:27:02,679 (trainer:732) INFO: 1epoch:train:9011-9540batch: iter_time=3.536e-04, forward_time=0.211, loss_att=349.903, acc=0.705, loss=349.903, backward_time=0.330, grad_norm=82.544, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.320e-04, train_time=2.934 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:33:29,226 (trainer:732) INFO: 1epoch:train:9541-10070batch: iter_time=3.541e-04, forward_time=0.212, loss_att=342.667, acc=0.708, loss=342.667, backward_time=0.329, grad_norm=88.063, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=2.452e-04, train_time=2.918 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:39:58,466 (trainer:732) INFO: 1epoch:train:10071-10600batch: iter_time=3.931e-04, forward_time=0.214, loss_att=339.918, acc=0.712, loss=339.918, backward_time=0.330, grad_norm=91.257, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.585e-04, train_time=2.936 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:48:34,060 (trainer:338) INFO: 1epoch results: [train] iter_time=5.553e-04, forward_time=0.215, loss_att=431.001, acc=0.647, loss=431.001, backward_time=0.330, grad_norm=111.384, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=1.328e-04, train_time=3.030, time=2 hours, 14 minutes and 11.7 seconds, total_count=10615, gpu_max_cached_mem_GB=30.438, [valid] loss_att=288.430, acc=0.744, cer=0.319, wer=0.639, loss=288.430, time=4 minutes and 15.87 seconds, total_count=12, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 58.55 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:48:40,482 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:48:40,483 (trainer:272) INFO: 2/60epoch started. Estimated time to finish: 5 days, 20 hours and 10 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 22:58:07,854 (trainer:732) INFO: 2epoch:train:1-530batch: iter_time=0.002, forward_time=0.215, loss_att=335.561, acc=0.716, loss=335.561, backward_time=0.331, grad_norm=90.428, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.721e-04, train_time=4.287 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:04:34,184 (trainer:732) INFO: 2epoch:train:531-1060batch: iter_time=3.514e-04, forward_time=0.212, loss_att=320.350, acc=0.723, loss=320.350, backward_time=0.329, grad_norm=89.353, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.853e-04, train_time=2.915 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:11:01,305 (trainer:732) INFO: 2epoch:train:1061-1590batch: iter_time=3.411e-04, forward_time=0.213, loss_att=324.716, acc=0.726, loss=324.716, backward_time=0.329, grad_norm=87.585, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=2.985e-04, train_time=2.922 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:17:28,570 (trainer:732) INFO: 2epoch:train:1591-2120batch: iter_time=3.329e-04, forward_time=0.213, loss_att=320.389, acc=0.729, loss=320.389, backward_time=0.330, grad_norm=98.865, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.118e-04, train_time=2.921 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:23:55,179 (trainer:732) INFO: 2epoch:train:2121-2650batch: iter_time=3.640e-04, forward_time=0.212, loss_att=309.249, acc=0.733, loss=309.249, backward_time=0.329, grad_norm=95.880, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.251e-04, train_time=2.920 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:30:23,363 (trainer:732) INFO: 2epoch:train:2651-3180batch: iter_time=3.530e-04, forward_time=0.212, loss_att=311.422, acc=0.734, loss=311.422, backward_time=0.330, grad_norm=96.128, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.383e-04, train_time=2.927 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:36:51,001 (trainer:732) INFO: 2epoch:train:3181-3710batch: iter_time=3.453e-04, forward_time=0.213, loss_att=307.521, acc=0.739, loss=307.521, backward_time=0.330, grad_norm=92.452, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=3.515e-04, train_time=2.926 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:43:19,740 (trainer:732) INFO: 2epoch:train:3711-4240batch: iter_time=3.351e-04, forward_time=0.213, loss_att=313.679, acc=0.741, loss=313.679, backward_time=0.331, grad_norm=94.447, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.648e-04, train_time=2.933 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:49:45,490 (trainer:732) INFO: 2epoch:train:4241-4770batch: iter_time=3.476e-04, forward_time=0.212, loss_att=300.517, acc=0.743, loss=300.517, backward_time=0.329, grad_norm=94.425, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=3.780e-04, train_time=2.912 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 23:56:13,979 (trainer:732) INFO: 2epoch:train:4771-5300batch: iter_time=4.048e-04, forward_time=0.213, loss_att=302.041, acc=0.749, loss=302.041, backward_time=0.330, grad_norm=99.117, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.913e-04, train_time=2.930 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:02:39,869 (trainer:732) INFO: 2epoch:train:5301-5830batch: iter_time=3.425e-04, forward_time=0.212, loss_att=292.638, acc=0.750, loss=292.638, backward_time=0.328, grad_norm=96.227, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=4.045e-04, train_time=2.913 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:09:06,138 (trainer:732) INFO: 2epoch:train:5831-6360batch: iter_time=3.495e-04, forward_time=0.212, loss_att=288.686, acc=0.751, loss=288.686, backward_time=0.329, grad_norm=93.386, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=4.178e-04, train_time=2.913 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:15:33,034 (trainer:732) INFO: 2epoch:train:6361-6890batch: iter_time=3.297e-04, forward_time=0.212, loss_att=288.106, acc=0.754, loss=288.106, backward_time=0.329, grad_norm=101.259, clip=100.000, loss_scale=1.000, optim_step_time=0.128, optim0_lr0=4.310e-04, train_time=2.921 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:22:02,202 (trainer:732) INFO: 2epoch:train:6891-7420batch: iter_time=3.346e-04, forward_time=0.214, loss_att=286.014, acc=0.758, loss=286.014, backward_time=0.331, grad_norm=100.422, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=4.443e-04, train_time=2.935 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:28:29,699 (trainer:732) INFO: 2epoch:train:7421-7950batch: iter_time=3.430e-04, forward_time=0.213, loss_att=284.547, acc=0.758, loss=284.547, backward_time=0.329, grad_norm=94.321, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=4.575e-04, train_time=2.926 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:34:56,722 (trainer:732) INFO: 2epoch:train:7951-8480batch: iter_time=3.395e-04, forward_time=0.212, loss_att=274.991, acc=0.763, loss=274.991, backward_time=0.328, grad_norm=97.567, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=4.708e-04, train_time=2.919 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:41:23,220 (trainer:732) INFO: 2epoch:train:8481-9010batch: iter_time=3.418e-04, forward_time=0.213, loss_att=277.128, acc=0.762, loss=277.128, backward_time=0.329, grad_norm=93.027, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=4.840e-04, train_time=2.917 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:47:51,468 (trainer:732) INFO: 2epoch:train:9011-9540batch: iter_time=3.621e-04, forward_time=0.213, loss_att=279.367, acc=0.767, loss=279.367, backward_time=0.330, grad_norm=103.551, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=4.973e-04, train_time=2.929 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 00:54:16,566 (trainer:732) INFO: 2epoch:train:9541-10070batch: iter_time=3.496e-04, forward_time=0.211, loss_att=274.194, acc=0.768, loss=274.194, backward_time=0.329, grad_norm=97.588, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=5.106e-04, train_time=2.907 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:00:39,947 (trainer:732) INFO: 2epoch:train:10071-10600batch: iter_time=3.470e-04, forward_time=0.210, loss_att=270.299, acc=0.770, loss=270.299, backward_time=0.327, grad_norm=94.882, clip=100.000, loss_scale=1.000, optim_step_time=0.117, optim0_lr0=5.238e-04, train_time=2.891 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:08:50,568 (trainer:338) INFO: 2epoch results: [train] iter_time=4.546e-04, forward_time=0.213, loss_att=298.069, acc=0.747, loss=298.069, backward_time=0.329, grad_norm=95.535, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=3.981e-04, train_time=2.988, time=2 hours, 12 minutes and 21.09 seconds, total_count=21230, gpu_max_cached_mem_GB=30.438, [valid] loss_att=231.491, acc=0.793, cer=0.260, wer=0.554, loss=231.491, time=4 minutes and 12.29 seconds, total_count=24, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 36.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:08:55,956 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:08:55,958 (trainer:272) INFO: 3/60epoch started. Estimated time to finish: 5 days, 16 hours and 41 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:18:03,893 (trainer:732) INFO: 3epoch:train:1-530batch: iter_time=0.004, forward_time=0.211, loss_att=270.350, acc=0.775, loss=270.350, backward_time=0.328, grad_norm=103.014, clip=100.000, loss_scale=1.000, optim_step_time=0.114, optim0_lr0=5.374e-04, train_time=4.141 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:24:26,859 (trainer:732) INFO: 3epoch:train:531-1060batch: iter_time=3.780e-04, forward_time=0.210, loss_att=267.329, acc=0.776, loss=267.329, backward_time=0.328, grad_norm=96.931, clip=100.000, loss_scale=1.000, optim_step_time=0.120, optim0_lr0=5.506e-04, train_time=2.888 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:30:46,708 (trainer:732) INFO: 3epoch:train:1061-1590batch: iter_time=3.556e-04, forward_time=0.208, loss_att=259.400, acc=0.777, loss=259.400, backward_time=0.324, grad_norm=95.530, clip=100.000, loss_scale=1.000, optim_step_time=0.117, optim0_lr0=5.638e-04, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:37:07,950 (trainer:732) INFO: 3epoch:train:1591-2120batch: iter_time=3.490e-04, forward_time=0.209, loss_att=256.172, acc=0.779, loss=256.172, backward_time=0.325, grad_norm=92.930, clip=100.000, loss_scale=1.000, optim_step_time=0.121, optim0_lr0=5.771e-04, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:43:27,834 (trainer:732) INFO: 3epoch:train:2121-2650batch: iter_time=3.487e-04, forward_time=0.208, loss_att=255.411, acc=0.781, loss=255.411, backward_time=0.325, grad_norm=97.235, clip=100.000, loss_scale=1.000, optim_step_time=0.111, optim0_lr0=5.904e-04, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:49:46,227 (trainer:732) INFO: 3epoch:train:2651-3180batch: iter_time=3.353e-04, forward_time=0.208, loss_att=259.197, acc=0.782, loss=259.197, backward_time=0.327, grad_norm=97.541, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=6.036e-04, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 01:56:04,634 (trainer:732) INFO: 3epoch:train:3181-3710batch: iter_time=3.374e-04, forward_time=0.209, loss_att=259.099, acc=0.782, loss=259.099, backward_time=0.327, grad_norm=101.754, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=6.169e-04, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:02:19,995 (trainer:732) INFO: 3epoch:train:3711-4240batch: iter_time=3.516e-04, forward_time=0.207, loss_att=245.298, acc=0.786, loss=245.298, backward_time=0.324, grad_norm=93.262, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=6.301e-04, train_time=2.831 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:08:36,736 (trainer:732) INFO: 3epoch:train:4241-4770batch: iter_time=3.558e-04, forward_time=0.208, loss_att=250.962, acc=0.786, loss=250.962, backward_time=0.326, grad_norm=100.203, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=6.433e-04, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:14:53,915 (trainer:732) INFO: 3epoch:train:4771-5300batch: iter_time=3.249e-04, forward_time=0.208, loss_att=246.312, acc=0.788, loss=246.312, backward_time=0.326, grad_norm=98.412, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=6.566e-04, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:21:10,448 (trainer:732) INFO: 3epoch:train:5301-5830batch: iter_time=3.482e-04, forward_time=0.207, loss_att=243.813, acc=0.790, loss=243.813, backward_time=0.325, grad_norm=91.642, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=6.699e-04, train_time=2.843 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:27:27,726 (trainer:732) INFO: 3epoch:train:5831-6360batch: iter_time=3.340e-04, forward_time=0.208, loss_att=241.549, acc=0.793, loss=241.549, backward_time=0.326, grad_norm=95.043, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=6.831e-04, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:33:47,258 (trainer:732) INFO: 3epoch:train:6361-6890batch: iter_time=3.356e-04, forward_time=0.209, loss_att=242.714, acc=0.794, loss=242.714, backward_time=0.328, grad_norm=99.027, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=6.963e-04, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:40:05,861 (trainer:732) INFO: 3epoch:train:6891-7420batch: iter_time=3.535e-04, forward_time=0.208, loss_att=237.596, acc=0.798, loss=237.596, backward_time=0.326, grad_norm=97.477, clip=100.000, loss_scale=1.000, optim_step_time=0.093, optim0_lr0=7.096e-04, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:46:24,404 (trainer:732) INFO: 3epoch:train:7421-7950batch: iter_time=3.520e-04, forward_time=0.209, loss_att=235.117, acc=0.802, loss=235.117, backward_time=0.327, grad_norm=97.936, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=7.229e-04, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:52:41,396 (trainer:732) INFO: 3epoch:train:7951-8480batch: iter_time=3.369e-04, forward_time=0.207, loss_att=230.370, acc=0.802, loss=230.370, backward_time=0.325, grad_norm=100.639, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=7.361e-04, train_time=2.843 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 02:58:59,945 (trainer:732) INFO: 3epoch:train:8481-9010batch: iter_time=3.462e-04, forward_time=0.208, loss_att=226.575, acc=0.806, loss=226.575, backward_time=0.326, grad_norm=99.755, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=7.493e-04, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:05:17,894 (trainer:732) INFO: 3epoch:train:9011-9540batch: iter_time=3.498e-04, forward_time=0.208, loss_att=223.901, acc=0.811, loss=223.901, backward_time=0.326, grad_norm=102.433, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=7.626e-04, train_time=2.850 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:11:33,029 (trainer:732) INFO: 3epoch:train:9541-10070batch: iter_time=3.572e-04, forward_time=0.206, loss_att=215.459, acc=0.812, loss=215.459, backward_time=0.323, grad_norm=94.881, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=7.758e-04, train_time=2.831 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:17:49,598 (trainer:732) INFO: 3epoch:train:10071-10600batch: iter_time=3.550e-04, forward_time=0.207, loss_att=218.948, acc=0.814, loss=218.948, backward_time=0.325, grad_norm=103.929, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=7.891e-04, train_time=2.841 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:25:54,734 (trainer:338) INFO: 3epoch results: [train] iter_time=5.319e-04, forward_time=0.208, loss_att=244.157, acc=0.792, loss=244.157, backward_time=0.326, grad_norm=97.967, clip=100.000, loss_scale=1.000, optim_step_time=0.096, optim0_lr0=6.634e-04, train_time=2.918, time=2 hours, 9 minutes and 11.71 seconds, total_count=31845, gpu_max_cached_mem_GB=30.438, [valid] loss_att=176.821, acc=0.843, cer=0.197, wer=0.475, loss=176.821, time=4 minutes and 9.31 seconds, total_count=36, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 37.75 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:26:00,135 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:26:00,137 (trainer:272) INFO: 4/60epoch started. Estimated time to finish: 5 days, 12 hours and 57 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:34:57,447 (trainer:732) INFO: 4epoch:train:1-530batch: iter_time=0.004, forward_time=0.208, loss_att=207.144, acc=0.820, loss=207.144, backward_time=0.325, grad_norm=99.428, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=8.026e-04, train_time=4.060 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:41:14,235 (trainer:732) INFO: 4epoch:train:531-1060batch: iter_time=3.451e-04, forward_time=0.208, loss_att=206.363, acc=0.822, loss=206.363, backward_time=0.326, grad_norm=97.367, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=8.159e-04, train_time=2.842 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:47:31,388 (trainer:732) INFO: 4epoch:train:1061-1590batch: iter_time=3.538e-04, forward_time=0.208, loss_att=200.495, acc=0.827, loss=200.495, backward_time=0.325, grad_norm=101.979, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=8.291e-04, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 03:53:49,492 (trainer:732) INFO: 4epoch:train:1591-2120batch: iter_time=3.343e-04, forward_time=0.208, loss_att=201.794, acc=0.829, loss=201.794, backward_time=0.326, grad_norm=101.885, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=8.424e-04, train_time=2.851 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:00:06,836 (trainer:732) INFO: 4epoch:train:2121-2650batch: iter_time=3.574e-04, forward_time=0.208, loss_att=197.741, acc=0.830, loss=197.741, backward_time=0.326, grad_norm=104.811, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=8.556e-04, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:06:23,788 (trainer:732) INFO: 4epoch:train:2651-3180batch: iter_time=3.531e-04, forward_time=0.207, loss_att=199.469, acc=0.831, loss=199.469, backward_time=0.325, grad_norm=107.604, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=8.689e-04, train_time=2.843 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:12:39,096 (trainer:732) INFO: 4epoch:train:3181-3710batch: iter_time=3.444e-04, forward_time=0.207, loss_att=189.102, acc=0.835, loss=189.102, backward_time=0.324, grad_norm=98.111, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=8.821e-04, train_time=2.833 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:18:56,734 (trainer:732) INFO: 4epoch:train:3711-4240batch: iter_time=3.425e-04, forward_time=0.208, loss_att=197.306, acc=0.834, loss=197.306, backward_time=0.326, grad_norm=106.202, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=8.954e-04, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:25:15,730 (trainer:732) INFO: 4epoch:train:4241-4770batch: iter_time=3.539e-04, forward_time=0.209, loss_att=191.636, acc=0.839, loss=191.636, backward_time=0.327, grad_norm=107.328, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=9.086e-04, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:31:33,689 (trainer:732) INFO: 4epoch:train:4771-5300batch: iter_time=3.469e-04, forward_time=0.208, loss_att=186.516, acc=0.841, loss=186.516, backward_time=0.326, grad_norm=105.231, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=9.219e-04, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:37:51,093 (trainer:732) INFO: 4epoch:train:5301-5830batch: iter_time=3.720e-04, forward_time=0.208, loss_att=185.340, acc=0.842, loss=185.340, backward_time=0.326, grad_norm=108.605, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=9.351e-04, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:44:10,720 (trainer:732) INFO: 4epoch:train:5831-6360batch: iter_time=3.452e-04, forward_time=0.209, loss_att=186.594, acc=0.843, loss=186.594, backward_time=0.328, grad_norm=111.085, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=9.484e-04, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:50:27,939 (trainer:732) INFO: 4epoch:train:6361-6890batch: iter_time=3.463e-04, forward_time=0.208, loss_att=182.243, acc=0.842, loss=182.243, backward_time=0.325, grad_norm=102.306, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=9.616e-04, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 04:56:45,254 (trainer:732) INFO: 4epoch:train:6891-7420batch: iter_time=3.506e-04, forward_time=0.208, loss_att=178.646, acc=0.847, loss=178.646, backward_time=0.326, grad_norm=104.093, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=9.749e-04, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:03:02,763 (trainer:732) INFO: 4epoch:train:7421-7950batch: iter_time=3.538e-04, forward_time=0.208, loss_att=181.066, acc=0.847, loss=181.066, backward_time=0.326, grad_norm=105.633, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=9.881e-04, train_time=2.850 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:09:21,181 (trainer:732) INFO: 4epoch:train:7951-8480batch: iter_time=3.371e-04, forward_time=0.208, loss_att=177.591, acc=0.851, loss=177.591, backward_time=0.327, grad_norm=109.970, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:15:37,228 (trainer:732) INFO: 4epoch:train:8481-9010batch: iter_time=3.450e-04, forward_time=0.208, loss_att=175.419, acc=0.851, loss=175.419, backward_time=0.325, grad_norm=105.462, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.840 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:21:54,829 (trainer:732) INFO: 4epoch:train:9011-9540batch: iter_time=3.421e-04, forward_time=0.208, loss_att=173.067, acc=0.852, loss=173.067, backward_time=0.325, grad_norm=104.117, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=0.001, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:28:14,167 (trainer:732) INFO: 4epoch:train:9541-10070batch: iter_time=3.494e-04, forward_time=0.209, loss_att=175.443, acc=0.852, loss=175.443, backward_time=0.327, grad_norm=108.375, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.864 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:34:31,948 (trainer:732) INFO: 4epoch:train:10071-10600batch: iter_time=3.245e-04, forward_time=0.208, loss_att=173.864, acc=0.854, loss=173.864, backward_time=0.326, grad_norm=107.769, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:42:37,381 (trainer:338) INFO: 4epoch results: [train] iter_time=5.381e-04, forward_time=0.208, loss_att=188.377, acc=0.839, loss=188.377, backward_time=0.326, grad_norm=104.866, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=9.287e-04, train_time=2.909, time=2 hours, 8 minutes and 50.15 seconds, total_count=42460, gpu_max_cached_mem_GB=30.438, [valid] loss_att=143.005, acc=0.874, cer=0.158, wer=0.404, loss=143.005, time=4 minutes and 4.68 seconds, total_count=48, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 42.41 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:42:42,950 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:42:42,952 (trainer:272) INFO: 5/60epoch started. Estimated time to finish: 5 days, 9 hours and 52 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:51:44,035 (trainer:732) INFO: 5epoch:train:1-530batch: iter_time=0.004, forward_time=0.209, loss_att=167.396, acc=0.858, loss=167.396, backward_time=0.326, grad_norm=109.812, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=4.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 05:58:01,081 (trainer:732) INFO: 5epoch:train:531-1060batch: iter_time=3.505e-04, forward_time=0.208, loss_att=167.386, acc=0.857, loss=167.386, backward_time=0.325, grad_norm=103.715, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:04:19,817 (trainer:732) INFO: 5epoch:train:1061-1590batch: iter_time=3.445e-04, forward_time=0.209, loss_att=165.457, acc=0.859, loss=165.457, backward_time=0.327, grad_norm=110.339, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.859 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:10:36,616 (trainer:732) INFO: 5epoch:train:1591-2120batch: iter_time=3.445e-04, forward_time=0.207, loss_att=162.915, acc=0.860, loss=162.915, backward_time=0.325, grad_norm=106.971, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.842 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:16:55,440 (trainer:732) INFO: 5epoch:train:2121-2650batch: iter_time=3.504e-04, forward_time=0.210, loss_att=164.179, acc=0.861, loss=164.179, backward_time=0.328, grad_norm=110.941, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:23:14,774 (trainer:732) INFO: 5epoch:train:2651-3180batch: iter_time=3.395e-04, forward_time=0.209, loss_att=163.369, acc=0.862, loss=163.369, backward_time=0.327, grad_norm=111.104, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.862 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:29:30,814 (trainer:732) INFO: 5epoch:train:3181-3710batch: iter_time=3.350e-04, forward_time=0.207, loss_att=161.116, acc=0.862, loss=161.116, backward_time=0.325, grad_norm=108.500, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.839 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:35:47,555 (trainer:732) INFO: 5epoch:train:3711-4240batch: iter_time=3.385e-04, forward_time=0.207, loss_att=160.800, acc=0.862, loss=160.800, backward_time=0.325, grad_norm=107.220, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.841 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:42:03,572 (trainer:732) INFO: 5epoch:train:4241-4770batch: iter_time=3.424e-04, forward_time=0.207, loss_att=155.098, acc=0.865, loss=155.098, backward_time=0.325, grad_norm=107.256, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.838 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:48:19,967 (trainer:732) INFO: 5epoch:train:4771-5300batch: iter_time=3.446e-04, forward_time=0.208, loss_att=158.089, acc=0.864, loss=158.089, backward_time=0.325, grad_norm=104.490, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.839 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 06:54:37,988 (trainer:732) INFO: 5epoch:train:5301-5830batch: iter_time=3.516e-04, forward_time=0.208, loss_att=156.874, acc=0.865, loss=156.874, backward_time=0.326, grad_norm=112.148, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:00:55,948 (trainer:732) INFO: 5epoch:train:5831-6360batch: iter_time=3.413e-04, forward_time=0.208, loss_att=155.724, acc=0.867, loss=155.724, backward_time=0.326, grad_norm=106.515, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.851 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:07:13,169 (trainer:732) INFO: 5epoch:train:6361-6890batch: iter_time=3.333e-04, forward_time=0.209, loss_att=155.617, acc=0.866, loss=155.617, backward_time=0.326, grad_norm=104.423, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:13:31,296 (trainer:732) INFO: 5epoch:train:6891-7420batch: iter_time=3.422e-04, forward_time=0.209, loss_att=153.844, acc=0.869, loss=153.844, backward_time=0.327, grad_norm=108.723, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:19:50,835 (trainer:732) INFO: 5epoch:train:7421-7950batch: iter_time=3.411e-04, forward_time=0.209, loss_att=153.747, acc=0.869, loss=153.747, backward_time=0.327, grad_norm=109.795, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:26:07,137 (trainer:732) INFO: 5epoch:train:7951-8480batch: iter_time=3.483e-04, forward_time=0.207, loss_att=150.004, acc=0.871, loss=150.004, backward_time=0.324, grad_norm=108.360, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.838 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:32:26,077 (trainer:732) INFO: 5epoch:train:8481-9010batch: iter_time=3.545e-04, forward_time=0.209, loss_att=153.204, acc=0.871, loss=153.204, backward_time=0.327, grad_norm=105.995, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:38:45,023 (trainer:732) INFO: 5epoch:train:9011-9540batch: iter_time=3.519e-04, forward_time=0.209, loss_att=152.125, acc=0.871, loss=152.125, backward_time=0.327, grad_norm=109.933, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=0.001, train_time=2.859 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:45:01,616 (trainer:732) INFO: 5epoch:train:9541-10070batch: iter_time=3.400e-04, forward_time=0.207, loss_att=149.076, acc=0.872, loss=149.076, backward_time=0.325, grad_norm=112.889, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.843 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:51:18,945 (trainer:732) INFO: 5epoch:train:10071-10600batch: iter_time=3.383e-04, forward_time=0.208, loss_att=148.694, acc=0.873, loss=148.694, backward_time=0.326, grad_norm=105.940, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:59:15,565 (trainer:338) INFO: 5epoch results: [train] iter_time=5.327e-04, forward_time=0.208, loss_att=157.732, acc=0.865, loss=157.732, backward_time=0.326, grad_norm=108.277, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.911, time=2 hours, 8 minutes and 56.32 seconds, total_count=53075, gpu_max_cached_mem_GB=30.438, [valid] loss_att=122.324, acc=0.892, cer=0.137, wer=0.356, loss=122.324, time=3 minutes and 54.47 seconds, total_count=60, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 41.82 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:59:21,025 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 07:59:21,028 (trainer:272) INFO: 6/60epoch started. Estimated time to finish: 5 days, 7 hours and 5 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:08:23,934 (trainer:732) INFO: 6epoch:train:1-530batch: iter_time=0.003, forward_time=0.210, loss_att=149.881, acc=0.876, loss=149.881, backward_time=0.328, grad_norm=112.216, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=4.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:14:42,038 (trainer:732) INFO: 6epoch:train:531-1060batch: iter_time=3.445e-04, forward_time=0.209, loss_att=143.839, acc=0.877, loss=143.839, backward_time=0.326, grad_norm=111.565, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:21:00,881 (trainer:732) INFO: 6epoch:train:1061-1590batch: iter_time=3.371e-04, forward_time=0.210, loss_att=143.669, acc=0.877, loss=143.669, backward_time=0.327, grad_norm=109.578, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:27:20,472 (trainer:732) INFO: 6epoch:train:1591-2120batch: iter_time=3.563e-04, forward_time=0.209, loss_att=145.539, acc=0.877, loss=145.539, backward_time=0.327, grad_norm=114.398, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:33:35,281 (trainer:732) INFO: 6epoch:train:2121-2650batch: iter_time=3.424e-04, forward_time=0.206, loss_att=141.482, acc=0.877, loss=141.482, backward_time=0.324, grad_norm=106.856, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.829 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:39:52,665 (trainer:732) INFO: 6epoch:train:2651-3180batch: iter_time=3.425e-04, forward_time=0.208, loss_att=142.328, acc=0.878, loss=142.328, backward_time=0.325, grad_norm=111.574, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:46:10,862 (trainer:732) INFO: 6epoch:train:3181-3710batch: iter_time=3.505e-04, forward_time=0.209, loss_att=141.641, acc=0.879, loss=141.641, backward_time=0.326, grad_norm=107.487, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:52:27,574 (trainer:732) INFO: 6epoch:train:3711-4240batch: iter_time=3.457e-04, forward_time=0.207, loss_att=140.599, acc=0.879, loss=140.599, backward_time=0.325, grad_norm=105.118, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.841 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 08:58:45,406 (trainer:732) INFO: 6epoch:train:4241-4770batch: iter_time=3.594e-04, forward_time=0.209, loss_att=140.368, acc=0.880, loss=140.368, backward_time=0.326, grad_norm=108.845, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:05:03,308 (trainer:732) INFO: 6epoch:train:4771-5300batch: iter_time=3.552e-04, forward_time=0.209, loss_att=140.570, acc=0.881, loss=140.570, backward_time=0.326, grad_norm=113.507, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=2.851 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:11:20,274 (trainer:732) INFO: 6epoch:train:5301-5830batch: iter_time=3.407e-04, forward_time=0.208, loss_att=137.558, acc=0.881, loss=137.558, backward_time=0.325, grad_norm=101.988, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:17:37,809 (trainer:732) INFO: 6epoch:train:5831-6360batch: iter_time=3.539e-04, forward_time=0.209, loss_att=136.888, acc=0.881, loss=136.888, backward_time=0.326, grad_norm=107.420, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:23:54,255 (trainer:732) INFO: 6epoch:train:6361-6890batch: iter_time=3.469e-04, forward_time=0.207, loss_att=135.865, acc=0.883, loss=135.865, backward_time=0.325, grad_norm=105.107, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.842 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:30:13,867 (trainer:732) INFO: 6epoch:train:6891-7420batch: iter_time=3.404e-04, forward_time=0.209, loss_att=137.144, acc=0.884, loss=137.144, backward_time=0.328, grad_norm=111.080, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:36:31,632 (trainer:732) INFO: 6epoch:train:7421-7950batch: iter_time=3.451e-04, forward_time=0.209, loss_att=137.644, acc=0.883, loss=137.644, backward_time=0.326, grad_norm=103.272, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:42:48,704 (trainer:732) INFO: 6epoch:train:7951-8480batch: iter_time=3.449e-04, forward_time=0.207, loss_att=132.244, acc=0.884, loss=132.244, backward_time=0.325, grad_norm=107.562, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:49:06,202 (trainer:732) INFO: 6epoch:train:8481-9010batch: iter_time=3.617e-04, forward_time=0.208, loss_att=135.764, acc=0.883, loss=135.764, backward_time=0.325, grad_norm=111.433, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=0.002, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 09:55:26,403 (trainer:732) INFO: 6epoch:train:9011-9540batch: iter_time=3.396e-04, forward_time=0.210, loss_att=135.876, acc=0.886, loss=135.876, backward_time=0.328, grad_norm=109.069, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:01:43,483 (trainer:732) INFO: 6epoch:train:9541-10070batch: iter_time=3.424e-04, forward_time=0.208, loss_att=134.671, acc=0.884, loss=134.671, backward_time=0.326, grad_norm=111.236, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:08:00,085 (trainer:732) INFO: 6epoch:train:10071-10600batch: iter_time=3.542e-04, forward_time=0.208, loss_att=131.583, acc=0.886, loss=131.583, backward_time=0.325, grad_norm=112.377, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.841 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:15:55,583 (trainer:338) INFO: 6epoch results: [train] iter_time=4.615e-04, forward_time=0.209, loss_att=139.221, acc=0.881, loss=139.221, backward_time=0.326, grad_norm=109.077, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.912, time=2 hours, 8 minutes and 57.31 seconds, total_count=63690, gpu_max_cached_mem_GB=30.438, [valid] loss_att=109.506, acc=0.903, cer=0.123, wer=0.330, loss=109.506, time=3 minutes and 57.55 seconds, total_count=72, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 39.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:16:00,966 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:16:00,969 (trainer:272) INFO: 7/60epoch started. Estimated time to finish: 5 days, 4 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:24:58,942 (trainer:732) INFO: 7epoch:train:1-530batch: iter_time=0.004, forward_time=0.209, loss_att=128.932, acc=0.889, loss=128.932, backward_time=0.326, grad_norm=110.444, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=4.065 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:31:15,860 (trainer:732) INFO: 7epoch:train:531-1060batch: iter_time=3.617e-04, forward_time=0.208, loss_att=131.318, acc=0.887, loss=131.318, backward_time=0.325, grad_norm=108.355, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:37:32,905 (trainer:732) INFO: 7epoch:train:1061-1590batch: iter_time=3.581e-04, forward_time=0.208, loss_att=129.844, acc=0.890, loss=129.844, backward_time=0.326, grad_norm=111.071, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:43:53,295 (trainer:732) INFO: 7epoch:train:1591-2120batch: iter_time=3.254e-04, forward_time=0.209, loss_att=132.587, acc=0.889, loss=132.587, backward_time=0.328, grad_norm=110.395, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:50:09,209 (trainer:732) INFO: 7epoch:train:2121-2650batch: iter_time=3.430e-04, forward_time=0.207, loss_att=127.107, acc=0.889, loss=127.107, backward_time=0.325, grad_norm=109.123, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.837 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 10:56:27,530 (trainer:732) INFO: 7epoch:train:2651-3180batch: iter_time=3.422e-04, forward_time=0.208, loss_att=128.340, acc=0.890, loss=128.340, backward_time=0.327, grad_norm=107.299, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:02:44,803 (trainer:732) INFO: 7epoch:train:3181-3710batch: iter_time=3.581e-04, forward_time=0.208, loss_att=127.735, acc=0.889, loss=127.735, backward_time=0.325, grad_norm=111.505, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:09:03,203 (trainer:732) INFO: 7epoch:train:3711-4240batch: iter_time=3.392e-04, forward_time=0.207, loss_att=131.029, acc=0.888, loss=131.029, backward_time=0.326, grad_norm=111.429, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:15:18,457 (trainer:732) INFO: 7epoch:train:4241-4770batch: iter_time=3.486e-04, forward_time=0.207, loss_att=128.556, acc=0.889, loss=128.556, backward_time=0.324, grad_norm=101.360, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.832 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:21:37,075 (trainer:732) INFO: 7epoch:train:4771-5300batch: iter_time=3.431e-04, forward_time=0.209, loss_att=125.847, acc=0.892, loss=125.847, backward_time=0.327, grad_norm=108.171, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.856 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:27:53,752 (trainer:732) INFO: 7epoch:train:5301-5830batch: iter_time=3.462e-04, forward_time=0.208, loss_att=126.625, acc=0.891, loss=126.625, backward_time=0.326, grad_norm=108.185, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:34:10,507 (trainer:732) INFO: 7epoch:train:5831-6360batch: iter_time=3.412e-04, forward_time=0.207, loss_att=126.597, acc=0.890, loss=126.597, backward_time=0.325, grad_norm=104.221, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.842 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:40:27,818 (trainer:732) INFO: 7epoch:train:6361-6890batch: iter_time=3.444e-04, forward_time=0.208, loss_att=127.759, acc=0.891, loss=127.759, backward_time=0.326, grad_norm=108.135, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:46:45,236 (trainer:732) INFO: 7epoch:train:6891-7420batch: iter_time=3.563e-04, forward_time=0.207, loss_att=124.532, acc=0.893, loss=124.532, backward_time=0.326, grad_norm=107.001, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:53:00,629 (trainer:732) INFO: 7epoch:train:7421-7950batch: iter_time=3.555e-04, forward_time=0.207, loss_att=122.760, acc=0.892, loss=122.760, backward_time=0.324, grad_norm=103.725, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.834 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 11:59:18,956 (trainer:732) INFO: 7epoch:train:7951-8480batch: iter_time=3.639e-04, forward_time=0.208, loss_att=124.324, acc=0.893, loss=124.324, backward_time=0.326, grad_norm=109.678, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.853 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:05:34,880 (trainer:732) INFO: 7epoch:train:8481-9010batch: iter_time=3.593e-04, forward_time=0.207, loss_att=121.757, acc=0.894, loss=121.757, backward_time=0.324, grad_norm=110.811, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.838 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:11:52,964 (trainer:732) INFO: 7epoch:train:9011-9540batch: iter_time=3.450e-04, forward_time=0.208, loss_att=127.203, acc=0.893, loss=127.203, backward_time=0.327, grad_norm=108.736, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:18:13,142 (trainer:732) INFO: 7epoch:train:9541-10070batch: iter_time=3.453e-04, forward_time=0.209, loss_att=126.175, acc=0.894, loss=126.175, backward_time=0.328, grad_norm=110.525, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:24:30,709 (trainer:732) INFO: 7epoch:train:10071-10600batch: iter_time=3.490e-04, forward_time=0.208, loss_att=123.971, acc=0.894, loss=123.971, backward_time=0.327, grad_norm=109.949, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:32:21,752 (trainer:338) INFO: 7epoch results: [train] iter_time=5.086e-04, forward_time=0.208, loss_att=127.134, acc=0.891, loss=127.134, backward_time=0.326, grad_norm=108.536, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.909, time=2 hours, 8 minutes and 48.13 seconds, total_count=74305, gpu_max_cached_mem_GB=30.438, [valid] loss_att=102.969, acc=0.910, cer=0.114, wer=0.312, loss=102.969, time=3 minutes and 54.69 seconds, total_count=84, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 37.96 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:32:27,500 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:32:27,503 (trainer:272) INFO: 8/60epoch started. Estimated time to finish: 5 days, 1 hour and 56 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:41:28,385 (trainer:732) INFO: 8epoch:train:1-530batch: iter_time=0.003, forward_time=0.209, loss_att=117.479, acc=0.897, loss=117.479, backward_time=0.325, grad_norm=105.230, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=4.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:47:45,763 (trainer:732) INFO: 8epoch:train:531-1060batch: iter_time=3.463e-04, forward_time=0.209, loss_att=121.268, acc=0.897, loss=121.268, backward_time=0.326, grad_norm=114.405, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 12:54:04,722 (trainer:732) INFO: 8epoch:train:1061-1590batch: iter_time=3.541e-04, forward_time=0.210, loss_att=121.798, acc=0.896, loss=121.798, backward_time=0.327, grad_norm=109.557, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:00:22,193 (trainer:732) INFO: 8epoch:train:1591-2120batch: iter_time=3.368e-04, forward_time=0.209, loss_att=120.599, acc=0.896, loss=120.599, backward_time=0.326, grad_norm=106.103, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:06:40,976 (trainer:732) INFO: 8epoch:train:2121-2650batch: iter_time=3.363e-04, forward_time=0.209, loss_att=122.559, acc=0.897, loss=122.559, backward_time=0.327, grad_norm=110.892, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:12:58,811 (trainer:732) INFO: 8epoch:train:2651-3180batch: iter_time=3.576e-04, forward_time=0.208, loss_att=120.449, acc=0.897, loss=120.449, backward_time=0.326, grad_norm=113.361, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.850 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:19:17,935 (trainer:732) INFO: 8epoch:train:3181-3710batch: iter_time=3.494e-04, forward_time=0.210, loss_att=120.046, acc=0.899, loss=120.046, backward_time=0.328, grad_norm=107.201, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:25:34,947 (trainer:732) INFO: 8epoch:train:3711-4240batch: iter_time=3.548e-04, forward_time=0.208, loss_att=119.564, acc=0.898, loss=119.564, backward_time=0.326, grad_norm=104.426, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:31:52,049 (trainer:732) INFO: 8epoch:train:4241-4770batch: iter_time=3.678e-04, forward_time=0.208, loss_att=116.923, acc=0.898, loss=116.923, backward_time=0.325, grad_norm=108.727, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:38:09,076 (trainer:732) INFO: 8epoch:train:4771-5300batch: iter_time=3.517e-04, forward_time=0.208, loss_att=118.649, acc=0.897, loss=118.649, backward_time=0.325, grad_norm=107.399, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:44:26,448 (trainer:732) INFO: 8epoch:train:5301-5830batch: iter_time=3.447e-04, forward_time=0.209, loss_att=119.559, acc=0.897, loss=119.559, backward_time=0.326, grad_norm=110.157, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:50:44,156 (trainer:732) INFO: 8epoch:train:5831-6360batch: iter_time=3.419e-04, forward_time=0.208, loss_att=116.734, acc=0.899, loss=116.734, backward_time=0.326, grad_norm=118.513, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 13:57:00,866 (trainer:732) INFO: 8epoch:train:6361-6890batch: iter_time=3.332e-04, forward_time=0.208, loss_att=118.196, acc=0.899, loss=118.196, backward_time=0.326, grad_norm=113.309, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:03:16,215 (trainer:732) INFO: 8epoch:train:6891-7420batch: iter_time=3.584e-04, forward_time=0.207, loss_att=116.080, acc=0.898, loss=116.080, backward_time=0.324, grad_norm=107.178, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.831 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:09:32,172 (trainer:732) INFO: 8epoch:train:7421-7950batch: iter_time=3.582e-04, forward_time=0.208, loss_att=116.367, acc=0.899, loss=116.367, backward_time=0.325, grad_norm=105.776, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.839 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:15:49,507 (trainer:732) INFO: 8epoch:train:7951-8480batch: iter_time=3.699e-04, forward_time=0.208, loss_att=116.785, acc=0.900, loss=116.785, backward_time=0.326, grad_norm=102.275, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.845 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:22:06,296 (trainer:732) INFO: 8epoch:train:8481-9010batch: iter_time=3.545e-04, forward_time=0.208, loss_att=114.661, acc=0.901, loss=114.661, backward_time=0.325, grad_norm=109.088, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.844 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:28:24,801 (trainer:732) INFO: 8epoch:train:9011-9540batch: iter_time=3.471e-04, forward_time=0.209, loss_att=116.464, acc=0.902, loss=116.464, backward_time=0.327, grad_norm=104.016, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:34:43,095 (trainer:732) INFO: 8epoch:train:9541-10070batch: iter_time=3.683e-04, forward_time=0.209, loss_att=114.897, acc=0.901, loss=114.897, backward_time=0.327, grad_norm=106.041, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.856 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:40:59,415 (trainer:732) INFO: 8epoch:train:10071-10600batch: iter_time=3.540e-04, forward_time=0.208, loss_att=115.707, acc=0.900, loss=115.707, backward_time=0.325, grad_norm=105.118, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.839 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:48:53,648 (trainer:338) INFO: 8epoch results: [train] iter_time=5.055e-04, forward_time=0.208, loss_att=118.194, acc=0.898, loss=118.194, backward_time=0.326, grad_norm=108.431, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.910, time=2 hours, 8 minutes and 50.79 seconds, total_count=84920, gpu_max_cached_mem_GB=30.438, [valid] loss_att=96.275, acc=0.915, cer=0.107, wer=0.295, loss=96.275, time=3 minutes and 58.58 seconds, total_count=96, gpu_max_cached_mem_GB=30.438, [att_plot] time=3 minutes and 36.78 seconds, total_count=0, gpu_max_cached_mem_GB=30.438 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:48:59,363 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:48:59,367 (trainer:272) INFO: 9/60epoch started. Estimated time to finish: 4 days, 23 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 14:57:56,638 (trainer:732) INFO: 9epoch:train:1-530batch: iter_time=0.003, forward_time=0.209, loss_att=108.177, acc=0.904, loss=108.177, backward_time=0.325, grad_norm=102.611, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=4.060 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 15:04:14,256 (trainer:732) INFO: 9epoch:train:531-1060batch: iter_time=3.685e-04, forward_time=0.208, loss_att=113.481, acc=0.902, loss=113.481, backward_time=0.326, grad_norm=106.611, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 15:10:31,511 (trainer:732) INFO: 9epoch:train:1061-1590batch: iter_time=3.445e-04, forward_time=0.208, loss_att=109.032, acc=0.905, loss=109.032, backward_time=0.326, grad_norm=110.049, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 15:16:49,762 (trainer:732) INFO: 9epoch:train:1591-2120batch: iter_time=3.790e-04, forward_time=0.208, loss_att=110.022, acc=0.905, loss=110.022, backward_time=0.326, grad_norm=99.725, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.002, train_time=2.853 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 15:23:07,471 (trainer:732) INFO: 9epoch:train:2121-2650batch: iter_time=3.419e-04, forward_time=0.208, loss_att=110.447, acc=0.905, loss=110.447, backward_time=0.327, grad_norm=111.642, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.851 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-19 15:29:25,642 (trainer:732) INFO: 9epoch:train:2651-3180batch: iter_time=3.427e-04, forward_time=0.208, loss_att=111.232, acc=0.905, loss=111.232, backward_time=0.326, grad_norm=103.395, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.852 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<64896> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<15169> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<50096> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<47399> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<17333> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<17343> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<26297> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 137) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 137) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<26533> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 137) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 137) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045484:2049225 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<47948> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<48142> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<37642> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<25384> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<40890> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<40896> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<20556> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<20828> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 138) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045483:2049226 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<32459> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<32773> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<40540> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<45231> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<26529> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<30567> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<30569> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 149) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<31394> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<34298> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045481:2049224 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<26701> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<50503> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<21872> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<41758> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<41772> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<59964> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 135) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<63494> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2045482:2049227 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 134) +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGKILL +# Accounting: time=69006 threads=1 +# Ended (code 1) at Sun Nov 19 15:34:37 CST 2023, elapsed time 69006 seconds +/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 224 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.3.log b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.3.log new file mode 100644 index 0000000000000000000000000000000000000000..e019a230d62780cdceae5134afb7475ce8ed1407 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.3.log @@ -0,0 +1,1057 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Sat Nov 18 19:32:54 CST 2023 +# +/star-home/jinzengrui/lib/miniconda3/envs/dev39/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:14,486 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:14,486 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:14,530 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,927 (abs_task:1229) INFO: pytorch.version=1.11.0+cu102, cuda.available=True, cudnn.version=7605, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,941 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,942 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + eps: 1e-08 + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,942 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,945 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:22,967 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:25,928 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,548 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_medium_kaldi_fmt_sp/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_medium_kaldi_fmt_sp/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,548 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=9484, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,552 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=9484, mean=41.8, min=10, max=170 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,657 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,665 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,665 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=11, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,665 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=11, mean=41.1, min=22, max=66 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,671 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,697 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,697 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=452, batch_size=1, key_file=exp/asr_stats_raw_en_char_sp/valid/speech_shape, +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:32,697 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda10.2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1753233 [1] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1753234 [2] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1753233 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1753234 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1753235 [3] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1753235 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1753233 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1753233 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1753233 [1] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1753234 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1753234 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1753234 [2] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1753235 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1753235 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1753235 [3] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Setting affinity for GPU 1 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Setting affinity for GPU 2 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Setting affinity for GPU 5 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Setting affinity for GPU 0 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Channel 00 : 2[7000] -> 3[e000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Channel 01 : 2[7000] -> 3[e000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Channel 00 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Channel 01 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Channel 00 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Channel 01 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Channel 00 : 3[e000] -> 0[4000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Channel 01 : 3[e000] -> 0[4000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Channel 00 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Channel 01 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Channel 00 : 3[e000] -> 2[7000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Channel 01 : 3[e000] -> 2[7000] via direct shared memory +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Channel 00 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Channel 01 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753233:1756016 [1] NCCL INFO comm 0x7fc0a8001200 rank 1 nranks 4 cudaDev 1 busId 6000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753235:1756018 [3] NCCL INFO comm 0x7f483c001200 rank 3 nranks 4 cudaDev 3 busId e000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1756014 [0] NCCL INFO comm 0x7f8d5c001200 rank 0 nranks 4 cudaDev 0 busId 4000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753232:1753232 [0] NCCL INFO Launch mode Parallel +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:1753234:1756017 [2] NCCL INFO comm 0x7fbe0c001200 rank 2 nranks 4 cudaDev 2 busId 7000 - Init COMPLETE +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-18 19:33:34,575 (trainer:284) INFO: 1/60epoch started +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code +Traceback (most recent call last): + File "", line 1, in +Traceback (most recent call last): + File "", line 1, in +Traceback (most recent call last): +Traceback (most recent call last): + File "", line 1, in + File "", line 1, in + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/connection.py", line 931, in wait + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/selectors.py", line 416, in select + exitcode = _main(fd, parent_sentinel) + exitcode = _main(fd, parent_sentinel) + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + prepare(preparation_data) + prepare(preparation_data) + prepare(preparation_data) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + prepare(preparation_data) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt + _fixup_main_from_name(data['init_main_from_name']) + _fixup_main_from_name(data['init_main_from_name']) + _fixup_main_from_name(data['init_main_from_name']) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + _fixup_main_from_name(data['init_main_from_name']) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + main_content = runpy.run_module(mod_name, + main_content = runpy.run_module(mod_name, + main_content = runpy.run_module(mod_name, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 225, in run_module + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 225, in run_module + main_content = runpy.run_module(mod_name, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 225, in run_module + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 225, in run_module + return _run_module_code(code, init_globals, run_name, mod_spec) + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 97, in _run_module_code + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 97, in _run_module_code + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 97, in _run_module_code + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 97, in _run_module_code + _run_code(code, mod_globals, init_globals, + _run_code(code, mod_globals, init_globals, + _run_code(code, mod_globals, init_globals, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + _run_code(code, mod_globals, init_globals, + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + exec(code, run_globals) + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + from espnet2.tasks.asr import ASRTask + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 73, in + from espnet2.tasks.asr import ASRTask + from espnet2.tasks.asr import ASRTask + from espnet2.tasks.asr import ASRTask + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 73, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 73, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 73, in + from espnet2.tasks.abs_task import AbsTask + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 48, in + from espnet2.tasks.abs_task import AbsTask + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 48, in + from espnet2.tasks.abs_task import AbsTask + from espnet2.tasks.abs_task import AbsTask + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 48, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 48, in + from espnet2.train.dataset import DATA_TYPES, AbsDataset, ESPnetDataset + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/dataset.py", line 10, in + from espnet2.train.dataset import DATA_TYPES, AbsDataset, ESPnetDataset + from espnet2.train.dataset import DATA_TYPES, AbsDataset, ESPnetDataset + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/dataset.py", line 10, in + from espnet2.train.dataset import DATA_TYPES, AbsDataset, ESPnetDataset + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/dataset.py", line 10, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/dataset.py", line 10, in + import h5py + import h5py + import h5py + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/__init__.py", line 58, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/__init__.py", line 58, in + import h5py + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/__init__.py", line 58, in + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/__init__.py", line 58, in + from ._hl import filters + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/_hl/filters.py", line 44, in + from ._hl import filters + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/_hl/filters.py", line 44, in + from ._hl import filters + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/_hl/filters.py", line 44, in + from ._hl import filters + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/h5py/_hl/filters.py", line 44, in + from .base import product + File "", line 1007, in _find_and_load + from .base import product + File "", line 1007, in _find_and_load + from .base import product + File "", line 1007, in _find_and_load + from .base import product + File "", line 1007, in _find_and_load + File "", line 986, in _find_and_load_unlocked + File "", line 986, in _find_and_load_unlocked + File "", line 986, in _find_and_load_unlocked + File "", line 986, in _find_and_load_unlocked + File "", line 680, in _load_unlocked + File "", line 680, in _load_unlocked + File "", line 680, in _load_unlocked + File "", line 846, in exec_module + File "", line 680, in _load_unlocked + File "", line 846, in exec_module + File "", line 846, in exec_module + File "", line 941, in get_code + File "", line 941, in get_code + File "", line 941, in get_code + File "", line 1039, in get_data + File "", line 1039, in get_data + File "", line 846, in exec_module +KeyboardInterrupt + File "", line 1039, in get_data + File "", line 941, in get_code +KeyboardInterrupt +KeyboardInterrupt + File "", line 1039, in get_data +KeyboardInterrupt +Process SpawnProcess-3: +Process SpawnProcess-1: +Process SpawnProcess-4: +Process SpawnProcess-2: +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 368, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 368, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 314, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 368, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 314, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 927, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 368, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 314, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 927, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 314, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 927, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 927, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 62, in _launch + f.write(fp.getbuffer()) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 62, in _launch + f.write(fp.getbuffer()) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 62, in _launch + f.write(fp.getbuffer()) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 62, in _launch + f.write(fp.getbuffer()) +KeyboardInterrupt +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__ +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__ +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__ + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers + self._shutdown_workers() + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__ + if self._persistent_workers or self._workers_status[worker_id]: + if self._persistent_workers or self._workers_status[worker_id]: + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log new file mode 100644 index 0000000000000000000000000000000000000000..1e84e62c7e46e5f17bb73e32f95db5ac15f6ca8a --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log @@ -0,0 +1,4834 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Tue Nov 21 11:11:35 CST 2023 +# +/star-home/jinzengrui/lib/miniconda3/envs/dev39/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char_sp/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_medium_kaldi_fmt_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char_sp/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char_sp/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:11:57,012 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:11:57,012 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:11:57,049 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,275 (abs_task:1229) INFO: pytorch.version=1.11.0+cu102, cuda.available=True, cudnn.version=7605, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,292 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char_sp/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,293 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + eps: 1e-08 + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,293 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,295 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:05,321 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:09,414 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:19,856 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_medium_kaldi_fmt_sp/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_medium_kaldi_fmt_sp/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:19,856 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=10615, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:19,861 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=10615, mean=53.4, min=7, max=201 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,026 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,035 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,036 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=12, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,036 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=12, mean=50.4, min=17, max=82 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,041 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,069 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,069 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=605, batch_size=1, key_file=exp/asr_stats_raw_en_char_sp/valid/speech_shape, +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:20,069 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:21,835 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda10.2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992218 [3] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992217 [2] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992218 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992217 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992218 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992218 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992218 [3] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992217 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992217 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992217 [2] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992216 [1] NCCL INFO Bootstrap : Using eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992216 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992216 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992216 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.13.150<0> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992216 [1] NCCL INFO Using network Socket +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Setting affinity for GPU 1 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Setting affinity for GPU 2 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Setting affinity for GPU 3 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Setting affinity for GPU 0 to 20000002 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Channel 00 : 2[7000] -> 3[8000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Channel 00 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Channel 01 : 2[7000] -> 3[8000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Channel 01 : 1[6000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Channel 00 : 3[8000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Channel 01 : 3[8000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Channel 00 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Channel 01 : 0[4000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Channel 00 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Channel 01 : 1[6000] -> 0[4000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Channel 00 : 3[8000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Channel 01 : 3[8000] -> 2[7000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Connected all rings +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Channel 00 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Channel 01 : 2[7000] -> 1[6000] via P2P/IPC +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO Connected all trees +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992345 [1] NCCL INFO comm 0x7f1f84001200 rank 1 nranks 4 cudaDev 1 busId 6000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992343 [3] NCCL INFO comm 0x7f6518001200 rank 3 nranks 4 cudaDev 3 busId 8000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992342 [0] NCCL INFO comm 0x7ff654001200 rank 0 nranks 4 cudaDev 0 busId 4000 - Init COMPLETE +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992215 [0] NCCL INFO Launch mode Parallel +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992344 [2] NCCL INFO comm 0x7f8f24001200 rank 2 nranks 4 cudaDev 2 busId 7000 - Init COMPLETE +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:12:25,623 (trainer:284) INFO: 9/60epoch started +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:16:54,816 (distributed:948) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:23:30,611 (trainer:732) INFO: 9epoch:train:1-530batch: iter_time=0.003, forward_time=0.243, loss_att=107.878, acc=0.904, loss=107.878, backward_time=0.326, grad_norm=103.272, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=5.026 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:30:10,861 (trainer:732) INFO: 9epoch:train:531-1060batch: iter_time=3.925e-04, forward_time=0.232, loss_att=112.672, acc=0.903, loss=112.672, backward_time=0.328, grad_norm=105.518, clip=100.000, loss_scale=1.000, optim_step_time=0.106, optim0_lr0=0.002, train_time=3.019 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:37:16,495 (trainer:732) INFO: 9epoch:train:1061-1590batch: iter_time=6.724e-04, forward_time=0.244, loss_att=109.250, acc=0.905, loss=109.250, backward_time=0.336, grad_norm=112.951, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=0.002, train_time=3.212 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:44:17,351 (trainer:732) INFO: 9epoch:train:1591-2120batch: iter_time=5.168e-04, forward_time=0.243, loss_att=109.274, acc=0.906, loss=109.274, backward_time=0.340, grad_norm=104.180, clip=100.000, loss_scale=1.000, optim_step_time=0.155, optim0_lr0=0.002, train_time=3.176 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:50:55,846 (trainer:732) INFO: 9epoch:train:2121-2650batch: iter_time=3.850e-04, forward_time=0.224, loss_att=110.221, acc=0.905, loss=110.221, backward_time=0.329, grad_norm=110.200, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=0.002, train_time=3.005 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 11:57:32,032 (trainer:732) INFO: 9epoch:train:2651-3180batch: iter_time=3.700e-04, forward_time=0.222, loss_att=111.572, acc=0.905, loss=111.572, backward_time=0.329, grad_norm=102.797, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.002, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:04:04,565 (trainer:732) INFO: 9epoch:train:3181-3710batch: iter_time=3.615e-04, forward_time=0.217, loss_att=108.396, acc=0.906, loss=108.396, backward_time=0.328, grad_norm=103.265, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.962 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:10:36,127 (trainer:732) INFO: 9epoch:train:3711-4240batch: iter_time=3.590e-04, forward_time=0.214, loss_att=105.457, acc=0.907, loss=105.457, backward_time=0.326, grad_norm=105.516, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=2.955 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:17:09,208 (trainer:732) INFO: 9epoch:train:4241-4770batch: iter_time=3.563e-04, forward_time=0.216, loss_att=110.350, acc=0.906, loss=110.350, backward_time=0.328, grad_norm=104.195, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.967 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:23:40,851 (trainer:732) INFO: 9epoch:train:4771-5300batch: iter_time=3.551e-04, forward_time=0.214, loss_att=106.497, acc=0.907, loss=106.497, backward_time=0.328, grad_norm=105.330, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.955 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:30:14,025 (trainer:732) INFO: 9epoch:train:5301-5830batch: iter_time=3.721e-04, forward_time=0.215, loss_att=108.151, acc=0.909, loss=108.151, backward_time=0.329, grad_norm=100.440, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=2.969 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:36:46,245 (trainer:732) INFO: 9epoch:train:5831-6360batch: iter_time=3.697e-04, forward_time=0.215, loss_att=107.337, acc=0.908, loss=107.337, backward_time=0.329, grad_norm=105.336, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=2.958 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:43:17,129 (trainer:732) INFO: 9epoch:train:6361-6890batch: iter_time=3.972e-04, forward_time=0.214, loss_att=106.133, acc=0.909, loss=106.133, backward_time=0.329, grad_norm=104.171, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.952 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:49:46,985 (trainer:732) INFO: 9epoch:train:6891-7420batch: iter_time=3.609e-04, forward_time=0.212, loss_att=107.243, acc=0.908, loss=107.243, backward_time=0.328, grad_norm=108.454, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 12:56:16,362 (trainer:732) INFO: 9epoch:train:7421-7950batch: iter_time=3.385e-04, forward_time=0.212, loss_att=106.428, acc=0.909, loss=106.428, backward_time=0.328, grad_norm=104.647, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:02:47,731 (trainer:732) INFO: 9epoch:train:7951-8480batch: iter_time=3.590e-04, forward_time=0.213, loss_att=106.208, acc=0.909, loss=106.208, backward_time=0.329, grad_norm=101.075, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=2.952 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:09:17,223 (trainer:732) INFO: 9epoch:train:8481-9010batch: iter_time=3.592e-04, forward_time=0.212, loss_att=105.647, acc=0.910, loss=105.647, backward_time=0.328, grad_norm=107.248, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.002, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:15:46,381 (trainer:732) INFO: 9epoch:train:9011-9540batch: iter_time=3.569e-04, forward_time=0.213, loss_att=105.196, acc=0.909, loss=105.196, backward_time=0.327, grad_norm=101.953, clip=100.000, loss_scale=1.000, optim_step_time=0.133, optim0_lr0=0.002, train_time=2.936 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:22:16,880 (trainer:732) INFO: 9epoch:train:9541-10070batch: iter_time=3.752e-04, forward_time=0.213, loss_att=103.786, acc=0.911, loss=103.786, backward_time=0.328, grad_norm=105.904, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.002, train_time=2.948 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:28:46,965 (trainer:732) INFO: 9epoch:train:10071-10600batch: iter_time=3.937e-04, forward_time=0.214, loss_att=106.660, acc=0.909, loss=106.660, backward_time=0.328, grad_norm=106.106, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.002, train_time=2.942 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:36:48,702 (trainer:338) INFO: 9epoch results: [train] iter_time=5.420e-04, forward_time=0.220, loss_att=107.695, acc=0.907, loss=107.695, backward_time=0.329, grad_norm=105.138, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.002, train_time=3.086, time=2 hours, 16 minutes and 39.91 seconds, total_count=95535, gpu_max_cached_mem_GB=30.221, [valid] loss_att=86.251, acc=0.923, cer=0.095, wer=0.271, loss=86.251, time=3 minutes and 49.1 seconds, total_count=108, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 54.02 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:36:55,064 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:36:55,068 (trainer:272) INFO: 10/60epoch started. Estimated time to finish: 5 days, 2 hours and 49 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:46:10,615 (trainer:732) INFO: 10epoch:train:1-530batch: iter_time=0.003, forward_time=0.210, loss_att=102.166, acc=0.914, loss=102.166, backward_time=0.328, grad_norm=118.250, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=4.197 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:52:33,438 (trainer:732) INFO: 10epoch:train:531-1060batch: iter_time=2.670e-04, forward_time=0.209, loss_att=102.728, acc=0.913, loss=102.728, backward_time=0.329, grad_norm=105.553, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.888 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 13:58:55,989 (trainer:732) INFO: 10epoch:train:1061-1590batch: iter_time=2.663e-04, forward_time=0.209, loss_att=100.817, acc=0.913, loss=100.817, backward_time=0.329, grad_norm=99.264, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.888 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:05:14,843 (trainer:732) INFO: 10epoch:train:1591-2120batch: iter_time=2.965e-04, forward_time=0.207, loss_att=99.859, acc=0.912, loss=99.859, backward_time=0.325, grad_norm=110.384, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:11:36,297 (trainer:732) INFO: 10epoch:train:2121-2650batch: iter_time=2.751e-04, forward_time=0.209, loss_att=102.353, acc=0.914, loss=102.353, backward_time=0.328, grad_norm=103.741, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:17:57,312 (trainer:732) INFO: 10epoch:train:2651-3180batch: iter_time=2.727e-04, forward_time=0.208, loss_att=100.094, acc=0.913, loss=100.094, backward_time=0.328, grad_norm=102.789, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:24:17,169 (trainer:732) INFO: 10epoch:train:3181-3710batch: iter_time=2.658e-04, forward_time=0.207, loss_att=99.873, acc=0.913, loss=99.873, backward_time=0.326, grad_norm=101.976, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:30:36,982 (trainer:732) INFO: 10epoch:train:3711-4240batch: iter_time=2.662e-04, forward_time=0.207, loss_att=101.871, acc=0.912, loss=101.871, backward_time=0.326, grad_norm=102.441, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:36:57,003 (trainer:732) INFO: 10epoch:train:4241-4770batch: iter_time=2.498e-04, forward_time=0.207, loss_att=99.705, acc=0.913, loss=99.705, backward_time=0.326, grad_norm=102.685, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:43:17,122 (trainer:732) INFO: 10epoch:train:4771-5300batch: iter_time=2.473e-04, forward_time=0.207, loss_att=99.063, acc=0.915, loss=99.063, backward_time=0.327, grad_norm=97.088, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:49:39,485 (trainer:732) INFO: 10epoch:train:5301-5830batch: iter_time=2.817e-04, forward_time=0.211, loss_att=100.056, acc=0.915, loss=100.056, backward_time=0.328, grad_norm=104.276, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.887 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 14:56:00,585 (trainer:732) INFO: 10epoch:train:5831-6360batch: iter_time=2.727e-04, forward_time=0.208, loss_att=98.506, acc=0.915, loss=98.506, backward_time=0.327, grad_norm=99.752, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:02:16,987 (trainer:732) INFO: 10epoch:train:6361-6890batch: iter_time=2.670e-04, forward_time=0.206, loss_att=98.845, acc=0.913, loss=98.845, backward_time=0.324, grad_norm=99.659, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.841 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:08:38,525 (trainer:732) INFO: 10epoch:train:6891-7420batch: iter_time=2.624e-04, forward_time=0.209, loss_att=98.738, acc=0.915, loss=98.738, backward_time=0.329, grad_norm=96.360, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:14:56,925 (trainer:732) INFO: 10epoch:train:7421-7950batch: iter_time=2.769e-04, forward_time=0.207, loss_att=95.270, acc=0.916, loss=95.270, backward_time=0.326, grad_norm=108.357, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:21:15,434 (trainer:732) INFO: 10epoch:train:7951-8480batch: iter_time=2.588e-04, forward_time=0.206, loss_att=93.747, acc=0.916, loss=93.747, backward_time=0.325, grad_norm=101.109, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:27:36,830 (trainer:732) INFO: 10epoch:train:8481-9010batch: iter_time=2.515e-04, forward_time=0.208, loss_att=97.102, acc=0.917, loss=97.102, backward_time=0.329, grad_norm=99.966, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:33:57,884 (trainer:732) INFO: 10epoch:train:9011-9540batch: iter_time=2.492e-04, forward_time=0.208, loss_att=97.386, acc=0.918, loss=97.386, backward_time=0.328, grad_norm=103.383, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:40:16,206 (trainer:732) INFO: 10epoch:train:9541-10070batch: iter_time=2.655e-04, forward_time=0.207, loss_att=95.721, acc=0.917, loss=95.721, backward_time=0.326, grad_norm=98.140, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.856 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:46:35,569 (trainer:732) INFO: 10epoch:train:10071-10600batch: iter_time=2.564e-04, forward_time=0.207, loss_att=98.793, acc=0.916, loss=98.793, backward_time=0.326, grad_norm=105.059, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:54:39,675 (trainer:338) INFO: 10epoch results: [train] iter_time=4.263e-04, forward_time=0.208, loss_att=99.088, acc=0.914, loss=99.088, backward_time=0.327, grad_norm=103.012, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.935, time=2 hours, 9 minutes and 55.13 seconds, total_count=106150, gpu_max_cached_mem_GB=30.221, [valid] loss_att=84.361, acc=0.925, cer=0.094, wer=0.264, loss=84.361, time=4 minutes and 6.77 seconds, total_count=120, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 42.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:54:45,470 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 15:54:45,474 (trainer:272) INFO: 11/60epoch started. Estimated time to finish: 4 days, 21 hours and 38 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:03:50,915 (trainer:732) INFO: 11epoch:train:1-530batch: iter_time=0.003, forward_time=0.208, loss_att=93.775, acc=0.920, loss=93.775, backward_time=0.327, grad_norm=103.984, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=4.122 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:10:10,003 (trainer:732) INFO: 11epoch:train:531-1060batch: iter_time=2.729e-04, forward_time=0.208, loss_att=93.559, acc=0.919, loss=93.559, backward_time=0.327, grad_norm=96.670, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.002, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:16:29,846 (trainer:732) INFO: 11epoch:train:1061-1590batch: iter_time=2.603e-04, forward_time=0.208, loss_att=94.037, acc=0.919, loss=94.037, backward_time=0.327, grad_norm=105.272, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:22:50,164 (trainer:732) INFO: 11epoch:train:1591-2120batch: iter_time=2.600e-04, forward_time=0.207, loss_att=92.883, acc=0.920, loss=92.883, backward_time=0.327, grad_norm=102.349, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:29:09,847 (trainer:732) INFO: 11epoch:train:2121-2650batch: iter_time=2.786e-04, forward_time=0.208, loss_att=93.225, acc=0.919, loss=93.225, backward_time=0.327, grad_norm=97.990, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:35:28,200 (trainer:732) INFO: 11epoch:train:2651-3180batch: iter_time=2.754e-04, forward_time=0.207, loss_att=91.866, acc=0.918, loss=91.866, backward_time=0.326, grad_norm=98.708, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:41:49,769 (trainer:732) INFO: 11epoch:train:3181-3710batch: iter_time=2.604e-04, forward_time=0.209, loss_att=94.260, acc=0.921, loss=94.260, backward_time=0.329, grad_norm=100.846, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.881 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:48:11,345 (trainer:732) INFO: 11epoch:train:3711-4240batch: iter_time=2.739e-04, forward_time=0.209, loss_att=92.331, acc=0.920, loss=92.331, backward_time=0.328, grad_norm=100.645, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 16:54:28,496 (trainer:732) INFO: 11epoch:train:4241-4770batch: iter_time=2.622e-04, forward_time=0.207, loss_att=90.641, acc=0.920, loss=90.641, backward_time=0.325, grad_norm=98.484, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.847 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:00:48,826 (trainer:732) INFO: 11epoch:train:4771-5300batch: iter_time=2.504e-04, forward_time=0.208, loss_att=92.767, acc=0.921, loss=92.767, backward_time=0.328, grad_norm=101.130, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:07:08,504 (trainer:732) INFO: 11epoch:train:5301-5830batch: iter_time=2.677e-04, forward_time=0.208, loss_att=93.959, acc=0.919, loss=93.959, backward_time=0.327, grad_norm=99.718, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:13:28,879 (trainer:732) INFO: 11epoch:train:5831-6360batch: iter_time=2.954e-04, forward_time=0.209, loss_att=92.038, acc=0.920, loss=92.038, backward_time=0.327, grad_norm=96.661, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:19:54,378 (trainer:732) INFO: 11epoch:train:6361-6890batch: iter_time=2.786e-04, forward_time=0.209, loss_att=91.525, acc=0.920, loss=91.525, backward_time=0.327, grad_norm=97.907, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.910 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:26:12,827 (trainer:732) INFO: 11epoch:train:6891-7420batch: iter_time=2.548e-04, forward_time=0.207, loss_att=90.595, acc=0.920, loss=90.595, backward_time=0.325, grad_norm=100.262, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:32:30,095 (trainer:732) INFO: 11epoch:train:7421-7950batch: iter_time=2.579e-04, forward_time=0.207, loss_att=88.876, acc=0.921, loss=88.876, backward_time=0.326, grad_norm=97.586, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.848 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:38:55,316 (trainer:732) INFO: 11epoch:train:7951-8480batch: iter_time=2.534e-04, forward_time=0.211, loss_att=90.253, acc=0.922, loss=90.253, backward_time=0.333, grad_norm=100.575, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.906 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:45:14,001 (trainer:732) INFO: 11epoch:train:8481-9010batch: iter_time=2.568e-04, forward_time=0.208, loss_att=91.696, acc=0.921, loss=91.696, backward_time=0.327, grad_norm=98.594, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.859 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:51:32,498 (trainer:732) INFO: 11epoch:train:9011-9540batch: iter_time=2.682e-04, forward_time=0.208, loss_att=91.003, acc=0.921, loss=91.003, backward_time=0.326, grad_norm=99.016, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 17:57:52,411 (trainer:732) INFO: 11epoch:train:9541-10070batch: iter_time=2.512e-04, forward_time=0.209, loss_att=91.092, acc=0.921, loss=91.092, backward_time=0.328, grad_norm=104.544, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:04:11,615 (trainer:732) INFO: 11epoch:train:10071-10600batch: iter_time=2.437e-04, forward_time=0.208, loss_att=92.590, acc=0.922, loss=92.590, backward_time=0.328, grad_norm=101.250, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:12:03,639 (trainer:338) INFO: 11epoch results: [train] iter_time=3.915e-04, forward_time=0.208, loss_att=92.145, acc=0.920, loss=92.145, backward_time=0.327, grad_norm=100.108, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.930, time=2 hours, 9 minutes and 45.29 seconds, total_count=116765, gpu_max_cached_mem_GB=30.221, [valid] loss_att=80.034, acc=0.929, cer=0.088, wer=0.253, loss=80.034, time=3 minutes and 59.74 seconds, total_count=132, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 33.14 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:12:09,022 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:12:09,028 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/1epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:12:09,028 (trainer:272) INFO: 12/60epoch started. Estimated time to finish: 4 days, 18 hours and 15 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:21:15,008 (trainer:732) INFO: 12epoch:train:1-530batch: iter_time=0.003, forward_time=0.209, loss_att=88.195, acc=0.925, loss=88.195, backward_time=0.328, grad_norm=112.369, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=4.126 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:27:33,475 (trainer:732) INFO: 12epoch:train:531-1060batch: iter_time=2.956e-04, forward_time=0.207, loss_att=86.632, acc=0.924, loss=86.632, backward_time=0.326, grad_norm=99.413, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:33:52,070 (trainer:732) INFO: 12epoch:train:1061-1590batch: iter_time=2.914e-04, forward_time=0.208, loss_att=86.413, acc=0.924, loss=86.413, backward_time=0.326, grad_norm=99.837, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.002, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:40:10,483 (trainer:732) INFO: 12epoch:train:1591-2120batch: iter_time=2.919e-04, forward_time=0.208, loss_att=87.114, acc=0.924, loss=87.114, backward_time=0.326, grad_norm=99.167, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:46:29,126 (trainer:732) INFO: 12epoch:train:2121-2650batch: iter_time=2.802e-04, forward_time=0.208, loss_att=86.086, acc=0.924, loss=86.086, backward_time=0.326, grad_norm=103.901, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:52:49,054 (trainer:732) INFO: 12epoch:train:2651-3180batch: iter_time=2.746e-04, forward_time=0.209, loss_att=87.225, acc=0.924, loss=87.225, backward_time=0.327, grad_norm=102.432, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 18:59:09,903 (trainer:732) INFO: 12epoch:train:3181-3710batch: iter_time=2.853e-04, forward_time=0.209, loss_att=90.009, acc=0.924, loss=90.009, backward_time=0.328, grad_norm=91.863, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:05:30,246 (trainer:732) INFO: 12epoch:train:3711-4240batch: iter_time=2.838e-04, forward_time=0.209, loss_att=87.496, acc=0.925, loss=87.496, backward_time=0.327, grad_norm=97.540, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:11:49,127 (trainer:732) INFO: 12epoch:train:4241-4770batch: iter_time=2.634e-04, forward_time=0.208, loss_att=86.990, acc=0.924, loss=86.990, backward_time=0.326, grad_norm=96.813, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:18:09,523 (trainer:732) INFO: 12epoch:train:4771-5300batch: iter_time=2.736e-04, forward_time=0.208, loss_att=86.179, acc=0.925, loss=86.179, backward_time=0.327, grad_norm=101.479, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:24:30,960 (trainer:732) INFO: 12epoch:train:5301-5830batch: iter_time=2.846e-04, forward_time=0.209, loss_att=87.784, acc=0.924, loss=87.784, backward_time=0.328, grad_norm=98.330, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:30:51,175 (trainer:732) INFO: 12epoch:train:5831-6360batch: iter_time=2.644e-04, forward_time=0.208, loss_att=86.661, acc=0.924, loss=86.661, backward_time=0.326, grad_norm=93.636, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:37:11,244 (trainer:732) INFO: 12epoch:train:6361-6890batch: iter_time=2.782e-04, forward_time=0.209, loss_att=87.355, acc=0.925, loss=87.355, backward_time=0.327, grad_norm=101.489, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:43:32,506 (trainer:732) INFO: 12epoch:train:6891-7420batch: iter_time=2.752e-04, forward_time=0.208, loss_att=85.098, acc=0.926, loss=85.098, backward_time=0.327, grad_norm=98.548, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:49:51,957 (trainer:732) INFO: 12epoch:train:7421-7950batch: iter_time=2.809e-04, forward_time=0.208, loss_att=85.900, acc=0.925, loss=85.900, backward_time=0.326, grad_norm=94.532, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 19:56:13,623 (trainer:732) INFO: 12epoch:train:7951-8480batch: iter_time=2.860e-04, forward_time=0.209, loss_att=87.003, acc=0.926, loss=87.003, backward_time=0.328, grad_norm=98.357, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.878 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:02:34,225 (trainer:732) INFO: 12epoch:train:8481-9010batch: iter_time=2.828e-04, forward_time=0.209, loss_att=85.184, acc=0.926, loss=85.184, backward_time=0.327, grad_norm=102.660, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:08:56,656 (trainer:732) INFO: 12epoch:train:9011-9540batch: iter_time=2.973e-04, forward_time=0.210, loss_att=87.746, acc=0.926, loss=87.746, backward_time=0.329, grad_norm=96.112, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.884 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:15:16,543 (trainer:732) INFO: 12epoch:train:9541-10070batch: iter_time=2.792e-04, forward_time=0.209, loss_att=85.907, acc=0.925, loss=85.907, backward_time=0.327, grad_norm=97.656, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:21:38,236 (trainer:732) INFO: 12epoch:train:10071-10600batch: iter_time=2.739e-04, forward_time=0.209, loss_att=86.917, acc=0.926, loss=86.917, backward_time=0.328, grad_norm=97.057, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.002, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:29:41,763 (trainer:338) INFO: 12epoch results: [train] iter_time=3.923e-04, forward_time=0.208, loss_att=86.891, acc=0.925, loss=86.891, backward_time=0.327, grad_norm=99.193, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.931, time=2 hours, 9 minutes and 47.29 seconds, total_count=127380, gpu_max_cached_mem_GB=30.221, [valid] loss_att=78.122, acc=0.931, cer=0.088, wer=0.248, loss=78.122, time=3 minutes and 59.73 seconds, total_count=144, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 45.67 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:29:47,437 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:29:47,443 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/2epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:29:47,443 (trainer:272) INFO: 13/60epoch started. Estimated time to finish: 4 days, 15 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:39:00,366 (trainer:732) INFO: 13epoch:train:1-530batch: iter_time=0.003, forward_time=0.210, loss_att=82.802, acc=0.929, loss=82.802, backward_time=0.329, grad_norm=95.504, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=4.178 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:45:21,290 (trainer:732) INFO: 13epoch:train:531-1060batch: iter_time=3.250e-04, forward_time=0.210, loss_att=85.727, acc=0.927, loss=85.727, backward_time=0.328, grad_norm=98.685, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:51:42,375 (trainer:732) INFO: 13epoch:train:1061-1590batch: iter_time=2.921e-04, forward_time=0.209, loss_att=83.588, acc=0.928, loss=83.588, backward_time=0.328, grad_norm=97.023, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 20:58:03,393 (trainer:732) INFO: 13epoch:train:1591-2120batch: iter_time=2.805e-04, forward_time=0.209, loss_att=84.096, acc=0.928, loss=84.096, backward_time=0.327, grad_norm=99.106, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:04:22,993 (trainer:732) INFO: 13epoch:train:2121-2650batch: iter_time=2.655e-04, forward_time=0.208, loss_att=84.202, acc=0.928, loss=84.202, backward_time=0.327, grad_norm=99.233, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:10:41,289 (trainer:732) INFO: 13epoch:train:2651-3180batch: iter_time=2.718e-04, forward_time=0.207, loss_att=81.634, acc=0.928, loss=81.634, backward_time=0.325, grad_norm=94.143, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.853 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:17:04,120 (trainer:732) INFO: 13epoch:train:3181-3710batch: iter_time=2.941e-04, forward_time=0.210, loss_att=84.076, acc=0.929, loss=84.076, backward_time=0.329, grad_norm=98.150, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.890 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:23:25,791 (trainer:732) INFO: 13epoch:train:3711-4240batch: iter_time=2.952e-04, forward_time=0.209, loss_att=81.814, acc=0.929, loss=81.814, backward_time=0.328, grad_norm=97.637, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:29:45,959 (trainer:732) INFO: 13epoch:train:4241-4770batch: iter_time=2.744e-04, forward_time=0.208, loss_att=81.810, acc=0.929, loss=81.810, backward_time=0.327, grad_norm=98.290, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.002, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:36:05,823 (trainer:732) INFO: 13epoch:train:4771-5300batch: iter_time=2.820e-04, forward_time=0.208, loss_att=82.458, acc=0.929, loss=82.458, backward_time=0.327, grad_norm=97.468, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:42:24,221 (trainer:732) INFO: 13epoch:train:5301-5830batch: iter_time=2.743e-04, forward_time=0.207, loss_att=80.803, acc=0.929, loss=80.803, backward_time=0.325, grad_norm=99.378, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:48:44,646 (trainer:732) INFO: 13epoch:train:5831-6360batch: iter_time=2.780e-04, forward_time=0.208, loss_att=80.881, acc=0.929, loss=80.881, backward_time=0.326, grad_norm=100.512, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 21:55:03,343 (trainer:732) INFO: 13epoch:train:6361-6890batch: iter_time=2.815e-04, forward_time=0.208, loss_att=84.103, acc=0.928, loss=84.103, backward_time=0.326, grad_norm=96.768, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.859 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:01:23,948 (trainer:732) INFO: 13epoch:train:6891-7420batch: iter_time=2.764e-04, forward_time=0.209, loss_att=84.653, acc=0.927, loss=84.653, backward_time=0.328, grad_norm=102.425, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:07:44,505 (trainer:732) INFO: 13epoch:train:7421-7950batch: iter_time=3.095e-04, forward_time=0.209, loss_att=80.752, acc=0.930, loss=80.752, backward_time=0.327, grad_norm=102.765, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:14:02,518 (trainer:732) INFO: 13epoch:train:7951-8480batch: iter_time=2.879e-04, forward_time=0.207, loss_att=78.983, acc=0.929, loss=78.983, backward_time=0.325, grad_norm=98.848, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:20:22,854 (trainer:732) INFO: 13epoch:train:8481-9010batch: iter_time=2.865e-04, forward_time=0.208, loss_att=80.065, acc=0.930, loss=80.065, backward_time=0.326, grad_norm=97.517, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:26:40,533 (trainer:732) INFO: 13epoch:train:9011-9540batch: iter_time=2.774e-04, forward_time=0.207, loss_att=80.407, acc=0.929, loss=80.407, backward_time=0.325, grad_norm=99.624, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:33:01,958 (trainer:732) INFO: 13epoch:train:9541-10070batch: iter_time=2.908e-04, forward_time=0.209, loss_att=80.453, acc=0.930, loss=80.453, backward_time=0.328, grad_norm=103.996, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:39:23,492 (trainer:732) INFO: 13epoch:train:10071-10600batch: iter_time=2.789e-04, forward_time=0.209, loss_att=81.495, acc=0.930, loss=81.495, backward_time=0.328, grad_norm=98.732, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.878 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:47:37,279 (trainer:338) INFO: 13epoch results: [train] iter_time=4.372e-04, forward_time=0.209, loss_att=82.205, acc=0.929, loss=82.205, backward_time=0.327, grad_norm=98.804, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.934, time=2 hours, 9 minutes and 55.34 seconds, total_count=137995, gpu_max_cached_mem_GB=30.221, [valid] loss_att=73.934, acc=0.934, cer=0.081, wer=0.235, loss=73.934, time=4 minutes and 9.99 seconds, total_count=156, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 44.5 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:47:42,873 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:47:42,878 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/3epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:47:42,879 (trainer:272) INFO: 14/60epoch started. Estimated time to finish: 4 days, 12 hours and 55 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 22:56:58,181 (trainer:732) INFO: 14epoch:train:1-530batch: iter_time=0.004, forward_time=0.209, loss_att=79.640, acc=0.932, loss=79.640, backward_time=0.327, grad_norm=102.191, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.002, train_time=4.197 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:03:19,469 (trainer:732) INFO: 14epoch:train:531-1060batch: iter_time=3.057e-04, forward_time=0.210, loss_att=80.240, acc=0.932, loss=80.240, backward_time=0.329, grad_norm=102.490, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:09:38,930 (trainer:732) INFO: 14epoch:train:1061-1590batch: iter_time=2.928e-04, forward_time=0.209, loss_att=80.316, acc=0.930, loss=80.316, backward_time=0.326, grad_norm=101.456, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.002, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:15:59,481 (trainer:732) INFO: 14epoch:train:1591-2120batch: iter_time=3.003e-04, forward_time=0.209, loss_att=78.854, acc=0.931, loss=78.854, backward_time=0.327, grad_norm=97.860, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.002, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:22:20,604 (trainer:732) INFO: 14epoch:train:2121-2650batch: iter_time=2.892e-04, forward_time=0.209, loss_att=80.266, acc=0.930, loss=80.266, backward_time=0.327, grad_norm=100.038, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:28:41,513 (trainer:732) INFO: 14epoch:train:2651-3180batch: iter_time=2.951e-04, forward_time=0.209, loss_att=78.937, acc=0.932, loss=78.937, backward_time=0.328, grad_norm=99.878, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:35:02,193 (trainer:732) INFO: 14epoch:train:3181-3710batch: iter_time=2.922e-04, forward_time=0.209, loss_att=78.476, acc=0.932, loss=78.476, backward_time=0.327, grad_norm=98.879, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.002, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:41:22,119 (trainer:732) INFO: 14epoch:train:3711-4240batch: iter_time=2.942e-04, forward_time=0.209, loss_att=78.264, acc=0.932, loss=78.264, backward_time=0.327, grad_norm=92.336, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.002, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:47:42,357 (trainer:732) INFO: 14epoch:train:4241-4770batch: iter_time=2.917e-04, forward_time=0.208, loss_att=79.047, acc=0.931, loss=79.047, backward_time=0.327, grad_norm=100.913, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-21 23:54:01,964 (trainer:732) INFO: 14epoch:train:4771-5300batch: iter_time=2.873e-04, forward_time=0.209, loss_att=79.981, acc=0.930, loss=79.981, backward_time=0.326, grad_norm=95.009, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:00:21,729 (trainer:732) INFO: 14epoch:train:5301-5830batch: iter_time=2.661e-04, forward_time=0.208, loss_att=78.500, acc=0.932, loss=78.500, backward_time=0.326, grad_norm=110.143, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:06:39,974 (trainer:732) INFO: 14epoch:train:5831-6360batch: iter_time=2.892e-04, forward_time=0.208, loss_att=79.758, acc=0.930, loss=79.758, backward_time=0.326, grad_norm=100.114, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.853 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:13:02,060 (trainer:732) INFO: 14epoch:train:6361-6890batch: iter_time=2.988e-04, forward_time=0.210, loss_att=79.145, acc=0.931, loss=79.145, backward_time=0.329, grad_norm=100.141, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.885 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:19:23,110 (trainer:732) INFO: 14epoch:train:6891-7420batch: iter_time=3.393e-04, forward_time=0.210, loss_att=77.734, acc=0.933, loss=77.734, backward_time=0.328, grad_norm=95.135, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:25:43,380 (trainer:732) INFO: 14epoch:train:7421-7950batch: iter_time=2.986e-04, forward_time=0.209, loss_att=78.146, acc=0.932, loss=78.146, backward_time=0.327, grad_norm=93.824, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:32:01,423 (trainer:732) INFO: 14epoch:train:7951-8480batch: iter_time=2.869e-04, forward_time=0.208, loss_att=76.540, acc=0.932, loss=76.540, backward_time=0.325, grad_norm=90.408, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:38:20,098 (trainer:732) INFO: 14epoch:train:8481-9010batch: iter_time=2.847e-04, forward_time=0.208, loss_att=76.655, acc=0.933, loss=76.655, backward_time=0.326, grad_norm=95.024, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:44:41,715 (trainer:732) INFO: 14epoch:train:9011-9540batch: iter_time=2.994e-04, forward_time=0.210, loss_att=77.535, acc=0.933, loss=77.535, backward_time=0.329, grad_norm=100.780, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:51:02,495 (trainer:732) INFO: 14epoch:train:9541-10070batch: iter_time=2.909e-04, forward_time=0.209, loss_att=77.838, acc=0.933, loss=77.838, backward_time=0.327, grad_norm=98.052, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 00:57:23,972 (trainer:732) INFO: 14epoch:train:10071-10600batch: iter_time=2.832e-04, forward_time=0.209, loss_att=78.255, acc=0.932, loss=78.255, backward_time=0.327, grad_norm=101.374, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.878 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:05:39,140 (trainer:338) INFO: 14epoch results: [train] iter_time=4.757e-04, forward_time=0.209, loss_att=78.711, acc=0.932, loss=78.711, backward_time=0.327, grad_norm=98.782, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.936, time=2 hours, 10 minutes and 0.1 seconds, total_count=148610, gpu_max_cached_mem_GB=30.221, [valid] loss_att=71.796, acc=0.936, cer=0.081, wer=0.232, loss=71.796, time=4 minutes and 11.2 seconds, total_count=168, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 44.92 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:05:45,138 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:05:45,144 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/4epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:05:45,144 (trainer:272) INFO: 15/60epoch started. Estimated time to finish: 4 days, 10 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:15:01,960 (trainer:732) INFO: 15epoch:train:1-530batch: iter_time=0.003, forward_time=0.210, loss_att=75.803, acc=0.935, loss=75.803, backward_time=0.329, grad_norm=95.252, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=4.207 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:21:23,031 (trainer:732) INFO: 15epoch:train:531-1060batch: iter_time=2.858e-04, forward_time=0.209, loss_att=75.079, acc=0.934, loss=75.079, backward_time=0.328, grad_norm=94.118, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:27:44,297 (trainer:732) INFO: 15epoch:train:1061-1590batch: iter_time=2.982e-04, forward_time=0.210, loss_att=75.617, acc=0.934, loss=75.617, backward_time=0.328, grad_norm=105.980, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.878 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:34:05,284 (trainer:732) INFO: 15epoch:train:1591-2120batch: iter_time=2.765e-04, forward_time=0.209, loss_att=77.546, acc=0.934, loss=77.546, backward_time=0.329, grad_norm=97.184, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:40:25,268 (trainer:732) INFO: 15epoch:train:2121-2650batch: iter_time=2.900e-04, forward_time=0.209, loss_att=75.924, acc=0.934, loss=75.924, backward_time=0.326, grad_norm=95.809, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:46:45,223 (trainer:732) INFO: 15epoch:train:2651-3180batch: iter_time=2.776e-04, forward_time=0.208, loss_att=75.399, acc=0.934, loss=75.399, backward_time=0.327, grad_norm=96.995, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:53:06,985 (trainer:732) INFO: 15epoch:train:3181-3710batch: iter_time=2.880e-04, forward_time=0.209, loss_att=79.030, acc=0.934, loss=79.030, backward_time=0.329, grad_norm=96.908, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.882 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 01:59:28,136 (trainer:732) INFO: 15epoch:train:3711-4240batch: iter_time=2.873e-04, forward_time=0.209, loss_att=76.681, acc=0.934, loss=76.681, backward_time=0.327, grad_norm=96.724, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:05:48,024 (trainer:732) INFO: 15epoch:train:4241-4770batch: iter_time=2.984e-04, forward_time=0.208, loss_att=74.926, acc=0.934, loss=74.926, backward_time=0.326, grad_norm=100.354, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:12:09,424 (trainer:732) INFO: 15epoch:train:4771-5300batch: iter_time=2.966e-04, forward_time=0.209, loss_att=77.320, acc=0.934, loss=77.320, backward_time=0.328, grad_norm=98.264, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:18:28,475 (trainer:732) INFO: 15epoch:train:5301-5830batch: iter_time=2.775e-04, forward_time=0.208, loss_att=74.227, acc=0.935, loss=74.227, backward_time=0.326, grad_norm=96.966, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.862 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:24:49,315 (trainer:732) INFO: 15epoch:train:5831-6360batch: iter_time=2.862e-04, forward_time=0.208, loss_att=75.883, acc=0.934, loss=75.883, backward_time=0.327, grad_norm=94.454, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:31:09,667 (trainer:732) INFO: 15epoch:train:6361-6890batch: iter_time=2.869e-04, forward_time=0.209, loss_att=74.998, acc=0.935, loss=74.998, backward_time=0.327, grad_norm=99.802, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:37:28,326 (trainer:732) INFO: 15epoch:train:6891-7420batch: iter_time=2.857e-04, forward_time=0.208, loss_att=73.957, acc=0.935, loss=73.957, backward_time=0.326, grad_norm=94.869, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:43:46,337 (trainer:732) INFO: 15epoch:train:7421-7950batch: iter_time=2.935e-04, forward_time=0.207, loss_att=73.536, acc=0.935, loss=73.536, backward_time=0.325, grad_norm=98.811, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.854 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:50:05,963 (trainer:732) INFO: 15epoch:train:7951-8480batch: iter_time=2.734e-04, forward_time=0.208, loss_att=75.411, acc=0.934, loss=75.411, backward_time=0.326, grad_norm=93.865, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 02:56:24,672 (trainer:732) INFO: 15epoch:train:8481-9010batch: iter_time=2.813e-04, forward_time=0.208, loss_att=74.348, acc=0.934, loss=74.348, backward_time=0.325, grad_norm=93.154, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.859 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:02:44,302 (trainer:732) INFO: 15epoch:train:9011-9540batch: iter_time=2.869e-04, forward_time=0.208, loss_att=74.879, acc=0.934, loss=74.879, backward_time=0.326, grad_norm=97.801, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:09:05,526 (trainer:732) INFO: 15epoch:train:9541-10070batch: iter_time=2.765e-04, forward_time=0.209, loss_att=75.019, acc=0.936, loss=75.019, backward_time=0.328, grad_norm=101.285, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:15:25,248 (trainer:732) INFO: 15epoch:train:10071-10600batch: iter_time=2.860e-04, forward_time=0.208, loss_att=75.580, acc=0.935, loss=75.580, backward_time=0.327, grad_norm=92.459, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:23:38,301 (trainer:338) INFO: 15epoch results: [train] iter_time=4.177e-04, forward_time=0.209, loss_att=75.538, acc=0.934, loss=75.538, backward_time=0.327, grad_norm=97.065, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.935, time=2 hours, 9 minutes and 59.62 seconds, total_count=159225, gpu_max_cached_mem_GB=30.221, [valid] loss_att=71.431, acc=0.937, cer=0.079, wer=0.230, loss=71.431, time=4 minutes and 5.65 seconds, total_count=180, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 47.86 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:23:43,952 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:23:43,959 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/5epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:23:43,959 (trainer:272) INFO: 16/60epoch started. Estimated time to finish: 4 days, 8 hours and 4 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:33:00,005 (trainer:732) INFO: 16epoch:train:1-530batch: iter_time=0.002, forward_time=0.208, loss_att=71.700, acc=0.937, loss=71.700, backward_time=0.325, grad_norm=96.666, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=4.203 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:39:23,292 (trainer:732) INFO: 16epoch:train:531-1060batch: iter_time=2.851e-04, forward_time=0.210, loss_att=74.656, acc=0.937, loss=74.656, backward_time=0.329, grad_norm=97.293, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.890 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:45:43,779 (trainer:732) INFO: 16epoch:train:1061-1590batch: iter_time=2.955e-04, forward_time=0.209, loss_att=73.160, acc=0.936, loss=73.160, backward_time=0.328, grad_norm=96.017, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:52:05,326 (trainer:732) INFO: 16epoch:train:1591-2120batch: iter_time=2.889e-04, forward_time=0.209, loss_att=73.309, acc=0.937, loss=73.309, backward_time=0.328, grad_norm=98.950, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 03:58:24,830 (trainer:732) INFO: 16epoch:train:2121-2650batch: iter_time=2.731e-04, forward_time=0.208, loss_att=74.580, acc=0.936, loss=74.580, backward_time=0.327, grad_norm=99.056, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:04:44,274 (trainer:732) INFO: 16epoch:train:2651-3180batch: iter_time=2.988e-04, forward_time=0.207, loss_att=72.100, acc=0.937, loss=72.100, backward_time=0.325, grad_norm=97.782, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:11:04,008 (trainer:732) INFO: 16epoch:train:3181-3710batch: iter_time=2.857e-04, forward_time=0.209, loss_att=72.621, acc=0.936, loss=72.621, backward_time=0.328, grad_norm=94.706, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.001, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:17:23,625 (trainer:732) INFO: 16epoch:train:3711-4240batch: iter_time=2.995e-04, forward_time=0.208, loss_att=73.823, acc=0.936, loss=73.823, backward_time=0.326, grad_norm=96.096, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.862 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:23:43,970 (trainer:732) INFO: 16epoch:train:4241-4770batch: iter_time=2.879e-04, forward_time=0.209, loss_att=73.081, acc=0.936, loss=73.081, backward_time=0.327, grad_norm=92.714, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:30:04,751 (trainer:732) INFO: 16epoch:train:4771-5300batch: iter_time=2.767e-04, forward_time=0.209, loss_att=72.919, acc=0.936, loss=72.919, backward_time=0.327, grad_norm=93.992, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:36:23,672 (trainer:732) INFO: 16epoch:train:5301-5830batch: iter_time=2.811e-04, forward_time=0.208, loss_att=72.242, acc=0.936, loss=72.242, backward_time=0.326, grad_norm=99.637, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:42:44,239 (trainer:732) INFO: 16epoch:train:5831-6360batch: iter_time=2.830e-04, forward_time=0.209, loss_att=72.931, acc=0.937, loss=72.931, backward_time=0.327, grad_norm=90.363, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:49:03,998 (trainer:732) INFO: 16epoch:train:6361-6890batch: iter_time=2.855e-04, forward_time=0.209, loss_att=72.918, acc=0.936, loss=72.918, backward_time=0.327, grad_norm=98.369, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 04:55:23,977 (trainer:732) INFO: 16epoch:train:6891-7420batch: iter_time=2.751e-04, forward_time=0.208, loss_att=72.073, acc=0.937, loss=72.073, backward_time=0.326, grad_norm=101.960, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:01:45,466 (trainer:732) INFO: 16epoch:train:7421-7950batch: iter_time=2.799e-04, forward_time=0.210, loss_att=72.650, acc=0.938, loss=72.650, backward_time=0.328, grad_norm=93.643, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:08:06,133 (trainer:732) INFO: 16epoch:train:7951-8480batch: iter_time=2.811e-04, forward_time=0.209, loss_att=72.886, acc=0.937, loss=72.886, backward_time=0.327, grad_norm=93.683, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:14:25,438 (trainer:732) INFO: 16epoch:train:8481-9010batch: iter_time=2.942e-04, forward_time=0.208, loss_att=71.268, acc=0.936, loss=71.268, backward_time=0.327, grad_norm=94.767, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:20:46,163 (trainer:732) INFO: 16epoch:train:9011-9540batch: iter_time=2.761e-04, forward_time=0.208, loss_att=71.805, acc=0.937, loss=71.805, backward_time=0.327, grad_norm=92.010, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:27:05,814 (trainer:732) INFO: 16epoch:train:9541-10070batch: iter_time=2.863e-04, forward_time=0.208, loss_att=72.559, acc=0.937, loss=72.559, backward_time=0.327, grad_norm=104.276, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:33:26,497 (trainer:732) INFO: 16epoch:train:10071-10600batch: iter_time=2.669e-04, forward_time=0.209, loss_att=73.589, acc=0.936, loss=73.589, backward_time=0.327, grad_norm=91.663, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:41:36,507 (trainer:338) INFO: 16epoch results: [train] iter_time=3.803e-04, forward_time=0.209, loss_att=72.850, acc=0.937, loss=72.850, backward_time=0.327, grad_norm=96.179, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.936, time=2 hours, 10 minutes and 3.27 seconds, total_count=169840, gpu_max_cached_mem_GB=30.221, [valid] loss_att=70.722, acc=0.937, cer=0.078, wer=0.225, loss=70.722, time=4 minutes and 3.71 seconds, total_count=192, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 45.54 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:41:42,450 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:41:42,457 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/6epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:41:42,457 (trainer:272) INFO: 17/60epoch started. Estimated time to finish: 4 days, 5 hours and 41 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:50:48,544 (trainer:732) INFO: 17epoch:train:1-530batch: iter_time=0.002, forward_time=0.206, loss_att=70.070, acc=0.938, loss=70.070, backward_time=0.323, grad_norm=99.924, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=4.126 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 05:57:09,790 (trainer:732) INFO: 17epoch:train:531-1060batch: iter_time=2.980e-04, forward_time=0.209, loss_att=70.816, acc=0.939, loss=70.816, backward_time=0.328, grad_norm=88.404, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:03:29,251 (trainer:732) INFO: 17epoch:train:1061-1590batch: iter_time=2.946e-04, forward_time=0.208, loss_att=71.979, acc=0.937, loss=71.979, backward_time=0.326, grad_norm=96.134, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.864 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:09:50,522 (trainer:732) INFO: 17epoch:train:1591-2120batch: iter_time=2.863e-04, forward_time=0.209, loss_att=71.022, acc=0.939, loss=71.022, backward_time=0.328, grad_norm=101.077, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:16:12,488 (trainer:732) INFO: 17epoch:train:2121-2650batch: iter_time=2.872e-04, forward_time=0.209, loss_att=71.864, acc=0.939, loss=71.864, backward_time=0.328, grad_norm=91.840, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.883 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:22:33,215 (trainer:732) INFO: 17epoch:train:2651-3180batch: iter_time=2.875e-04, forward_time=0.209, loss_att=72.657, acc=0.938, loss=72.657, backward_time=0.327, grad_norm=92.225, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:28:55,174 (trainer:732) INFO: 17epoch:train:3181-3710batch: iter_time=2.748e-04, forward_time=0.209, loss_att=70.747, acc=0.939, loss=70.747, backward_time=0.329, grad_norm=87.423, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.882 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:35:16,061 (trainer:732) INFO: 17epoch:train:3711-4240batch: iter_time=2.952e-04, forward_time=0.209, loss_att=70.358, acc=0.938, loss=70.358, backward_time=0.327, grad_norm=95.337, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:41:37,046 (trainer:732) INFO: 17epoch:train:4241-4770batch: iter_time=2.765e-04, forward_time=0.209, loss_att=70.779, acc=0.939, loss=70.779, backward_time=0.328, grad_norm=96.588, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:47:59,136 (trainer:732) INFO: 17epoch:train:4771-5300batch: iter_time=2.772e-04, forward_time=0.209, loss_att=72.575, acc=0.938, loss=72.575, backward_time=0.328, grad_norm=101.818, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.881 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 06:54:18,448 (trainer:732) INFO: 17epoch:train:5301-5830batch: iter_time=2.744e-04, forward_time=0.208, loss_att=69.490, acc=0.939, loss=69.490, backward_time=0.327, grad_norm=102.906, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.864 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:00:39,275 (trainer:732) INFO: 17epoch:train:5831-6360batch: iter_time=2.890e-04, forward_time=0.209, loss_att=69.010, acc=0.939, loss=69.010, backward_time=0.328, grad_norm=104.953, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:06:58,718 (trainer:732) INFO: 17epoch:train:6361-6890batch: iter_time=2.868e-04, forward_time=0.208, loss_att=71.326, acc=0.938, loss=71.326, backward_time=0.326, grad_norm=92.161, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:13:18,081 (trainer:732) INFO: 17epoch:train:6891-7420batch: iter_time=2.575e-04, forward_time=0.207, loss_att=69.338, acc=0.938, loss=69.338, backward_time=0.325, grad_norm=90.361, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:19:35,409 (trainer:732) INFO: 17epoch:train:7421-7950batch: iter_time=2.512e-04, forward_time=0.207, loss_att=69.442, acc=0.938, loss=69.442, backward_time=0.324, grad_norm=89.551, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.849 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:25:54,268 (trainer:732) INFO: 17epoch:train:7951-8480batch: iter_time=2.820e-04, forward_time=0.208, loss_att=68.867, acc=0.939, loss=68.867, backward_time=0.326, grad_norm=95.973, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:32:13,349 (trainer:732) INFO: 17epoch:train:8481-9010batch: iter_time=2.698e-04, forward_time=0.208, loss_att=69.424, acc=0.939, loss=69.424, backward_time=0.326, grad_norm=101.083, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:38:35,078 (trainer:732) INFO: 17epoch:train:9011-9540batch: iter_time=2.580e-04, forward_time=0.209, loss_att=69.943, acc=0.939, loss=69.943, backward_time=0.328, grad_norm=95.126, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:44:54,830 (trainer:732) INFO: 17epoch:train:9541-10070batch: iter_time=2.567e-04, forward_time=0.208, loss_att=68.878, acc=0.939, loss=68.878, backward_time=0.327, grad_norm=93.312, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:51:15,634 (trainer:732) INFO: 17epoch:train:10071-10600batch: iter_time=2.618e-04, forward_time=0.208, loss_att=69.504, acc=0.940, loss=69.504, backward_time=0.327, grad_norm=98.535, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:59:23,885 (trainer:338) INFO: 17epoch results: [train] iter_time=3.838e-04, forward_time=0.208, loss_att=70.388, acc=0.939, loss=70.388, backward_time=0.327, grad_norm=95.749, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.933, time=2 hours, 9 minutes and 51.42 seconds, total_count=180455, gpu_max_cached_mem_GB=30.221, [valid] loss_att=69.352, acc=0.939, cer=0.077, wer=0.221, loss=69.352, time=4 minutes and 1.45 seconds, total_count=204, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 48.56 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:59:29,527 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:59:29,533 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/7epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 07:59:29,533 (trainer:272) INFO: 18/60epoch started. Estimated time to finish: 4 days, 3 hours and 18 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:08:38,770 (trainer:732) INFO: 18epoch:train:1-530batch: iter_time=0.004, forward_time=0.209, loss_att=67.658, acc=0.941, loss=67.658, backward_time=0.327, grad_norm=94.810, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=4.150 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:14:59,591 (trainer:732) INFO: 18epoch:train:531-1060batch: iter_time=3.108e-04, forward_time=0.209, loss_att=69.062, acc=0.941, loss=69.062, backward_time=0.327, grad_norm=89.512, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:21:17,915 (trainer:732) INFO: 18epoch:train:1061-1590batch: iter_time=2.756e-04, forward_time=0.207, loss_att=66.806, acc=0.941, loss=66.806, backward_time=0.325, grad_norm=89.977, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.856 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:27:39,447 (trainer:732) INFO: 18epoch:train:1591-2120batch: iter_time=2.838e-04, forward_time=0.209, loss_att=68.357, acc=0.941, loss=68.357, backward_time=0.328, grad_norm=94.780, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:33:59,940 (trainer:732) INFO: 18epoch:train:2121-2650batch: iter_time=3.006e-04, forward_time=0.209, loss_att=68.023, acc=0.940, loss=68.023, backward_time=0.327, grad_norm=102.896, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:40:20,501 (trainer:732) INFO: 18epoch:train:2651-3180batch: iter_time=2.873e-04, forward_time=0.208, loss_att=68.085, acc=0.940, loss=68.085, backward_time=0.326, grad_norm=93.572, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:46:41,587 (trainer:732) INFO: 18epoch:train:3181-3710batch: iter_time=2.666e-04, forward_time=0.209, loss_att=71.069, acc=0.940, loss=71.069, backward_time=0.328, grad_norm=92.299, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.877 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:53:00,509 (trainer:732) INFO: 18epoch:train:3711-4240batch: iter_time=2.900e-04, forward_time=0.208, loss_att=67.610, acc=0.940, loss=67.610, backward_time=0.326, grad_norm=91.984, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 08:59:20,364 (trainer:732) INFO: 18epoch:train:4241-4770batch: iter_time=3.088e-04, forward_time=0.209, loss_att=67.293, acc=0.941, loss=67.293, backward_time=0.326, grad_norm=98.109, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:05:41,120 (trainer:732) INFO: 18epoch:train:4771-5300batch: iter_time=2.822e-04, forward_time=0.209, loss_att=69.033, acc=0.940, loss=69.033, backward_time=0.327, grad_norm=97.652, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:12:00,886 (trainer:732) INFO: 18epoch:train:5301-5830batch: iter_time=2.886e-04, forward_time=0.208, loss_att=67.839, acc=0.941, loss=67.839, backward_time=0.326, grad_norm=92.553, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:18:21,342 (trainer:732) INFO: 18epoch:train:5831-6360batch: iter_time=2.882e-04, forward_time=0.209, loss_att=68.591, acc=0.940, loss=68.591, backward_time=0.327, grad_norm=92.582, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:24:42,773 (trainer:732) INFO: 18epoch:train:6361-6890batch: iter_time=2.839e-04, forward_time=0.209, loss_att=68.705, acc=0.941, loss=68.705, backward_time=0.328, grad_norm=96.925, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:31:03,160 (trainer:732) INFO: 18epoch:train:6891-7420batch: iter_time=3.034e-04, forward_time=0.209, loss_att=68.626, acc=0.941, loss=68.626, backward_time=0.327, grad_norm=92.704, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:37:22,348 (trainer:732) INFO: 18epoch:train:7421-7950batch: iter_time=2.681e-04, forward_time=0.208, loss_att=67.417, acc=0.941, loss=67.417, backward_time=0.326, grad_norm=104.490, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:43:41,874 (trainer:732) INFO: 18epoch:train:7951-8480batch: iter_time=2.690e-04, forward_time=0.208, loss_att=67.423, acc=0.940, loss=67.423, backward_time=0.326, grad_norm=90.582, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.862 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:50:02,131 (trainer:732) INFO: 18epoch:train:8481-9010batch: iter_time=2.889e-04, forward_time=0.208, loss_att=68.081, acc=0.940, loss=68.081, backward_time=0.327, grad_norm=99.758, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 09:56:21,286 (trainer:732) INFO: 18epoch:train:9011-9540batch: iter_time=2.842e-04, forward_time=0.207, loss_att=66.946, acc=0.941, loss=66.946, backward_time=0.325, grad_norm=93.270, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:02:40,979 (trainer:732) INFO: 18epoch:train:9541-10070batch: iter_time=2.638e-04, forward_time=0.208, loss_att=67.948, acc=0.941, loss=67.948, backward_time=0.327, grad_norm=102.385, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:09:03,759 (trainer:732) INFO: 18epoch:train:10071-10600batch: iter_time=2.915e-04, forward_time=0.210, loss_att=68.180, acc=0.941, loss=68.180, backward_time=0.329, grad_norm=99.516, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.887 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:17:03,532 (trainer:338) INFO: 18epoch results: [train] iter_time=4.684e-04, forward_time=0.209, loss_att=68.136, acc=0.941, loss=68.136, backward_time=0.327, grad_norm=95.520, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.933, time=2 hours, 9 minutes and 54.08 seconds, total_count=191070, gpu_max_cached_mem_GB=30.221, [valid] loss_att=67.592, acc=0.941, cer=0.073, wer=0.215, loss=67.592, time=3 minutes and 56.99 seconds, total_count=216, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 42.91 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:17:09,442 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:17:09,449 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/8epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:17:09,449 (trainer:272) INFO: 19/60epoch started. Estimated time to finish: 4 days, 55 minutes and 52.07 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:26:19,861 (trainer:732) INFO: 19epoch:train:1-530batch: iter_time=0.003, forward_time=0.209, loss_att=65.537, acc=0.942, loss=65.537, backward_time=0.327, grad_norm=90.097, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=4.158 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:32:41,656 (trainer:732) INFO: 19epoch:train:531-1060batch: iter_time=2.886e-04, forward_time=0.209, loss_att=67.076, acc=0.942, loss=67.076, backward_time=0.328, grad_norm=91.725, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.881 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:39:02,366 (trainer:732) INFO: 19epoch:train:1061-1590batch: iter_time=2.860e-04, forward_time=0.209, loss_att=65.963, acc=0.943, loss=65.963, backward_time=0.327, grad_norm=95.774, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:45:20,544 (trainer:732) INFO: 19epoch:train:1591-2120batch: iter_time=3.065e-04, forward_time=0.207, loss_att=65.317, acc=0.942, loss=65.317, backward_time=0.324, grad_norm=97.570, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.852 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:51:40,941 (trainer:732) INFO: 19epoch:train:2121-2650batch: iter_time=2.862e-04, forward_time=0.209, loss_att=66.101, acc=0.943, loss=66.101, backward_time=0.328, grad_norm=94.835, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 10:58:00,511 (trainer:732) INFO: 19epoch:train:2651-3180batch: iter_time=3.043e-04, forward_time=0.208, loss_att=66.156, acc=0.942, loss=66.156, backward_time=0.326, grad_norm=97.561, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:04:21,075 (trainer:732) INFO: 19epoch:train:3181-3710batch: iter_time=2.947e-04, forward_time=0.209, loss_att=66.623, acc=0.942, loss=66.623, backward_time=0.328, grad_norm=90.769, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:10:41,423 (trainer:732) INFO: 19epoch:train:3711-4240batch: iter_time=2.940e-04, forward_time=0.209, loss_att=65.592, acc=0.942, loss=65.592, backward_time=0.327, grad_norm=98.596, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:17:01,419 (trainer:732) INFO: 19epoch:train:4241-4770batch: iter_time=2.761e-04, forward_time=0.208, loss_att=66.522, acc=0.942, loss=66.522, backward_time=0.327, grad_norm=93.859, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:23:20,715 (trainer:732) INFO: 19epoch:train:4771-5300batch: iter_time=2.731e-04, forward_time=0.208, loss_att=65.818, acc=0.942, loss=65.818, backward_time=0.326, grad_norm=93.484, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:29:41,601 (trainer:732) INFO: 19epoch:train:5301-5830batch: iter_time=2.836e-04, forward_time=0.209, loss_att=69.979, acc=0.940, loss=69.979, backward_time=0.328, grad_norm=93.897, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:36:00,835 (trainer:732) INFO: 19epoch:train:5831-6360batch: iter_time=2.750e-04, forward_time=0.207, loss_att=65.844, acc=0.942, loss=65.844, backward_time=0.326, grad_norm=94.551, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:42:21,256 (trainer:732) INFO: 19epoch:train:6361-6890batch: iter_time=2.847e-04, forward_time=0.209, loss_att=66.219, acc=0.943, loss=66.219, backward_time=0.327, grad_norm=97.514, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:48:43,012 (trainer:732) INFO: 19epoch:train:6891-7420batch: iter_time=2.745e-04, forward_time=0.209, loss_att=66.279, acc=0.942, loss=66.279, backward_time=0.327, grad_norm=90.616, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 11:55:03,188 (trainer:732) INFO: 19epoch:train:7421-7950batch: iter_time=2.908e-04, forward_time=0.209, loss_att=65.715, acc=0.943, loss=65.715, backward_time=0.326, grad_norm=91.042, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:01:24,803 (trainer:732) INFO: 19epoch:train:7951-8480batch: iter_time=2.878e-04, forward_time=0.209, loss_att=66.867, acc=0.942, loss=66.867, backward_time=0.328, grad_norm=95.884, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:07:43,834 (trainer:732) INFO: 19epoch:train:8481-9010batch: iter_time=2.839e-04, forward_time=0.208, loss_att=66.245, acc=0.942, loss=66.245, backward_time=0.326, grad_norm=88.896, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.861 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:14:03,842 (trainer:732) INFO: 19epoch:train:9011-9540batch: iter_time=2.870e-04, forward_time=0.209, loss_att=65.817, acc=0.943, loss=65.817, backward_time=0.327, grad_norm=88.337, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:20:22,944 (trainer:732) INFO: 19epoch:train:9541-10070batch: iter_time=2.954e-04, forward_time=0.209, loss_att=65.665, acc=0.941, loss=65.665, backward_time=0.326, grad_norm=90.411, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.862 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:26:43,285 (trainer:732) INFO: 19epoch:train:10071-10600batch: iter_time=2.935e-04, forward_time=0.209, loss_att=66.197, acc=0.942, loss=66.197, backward_time=0.326, grad_norm=84.469, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:34:43,129 (trainer:338) INFO: 19epoch results: [train] iter_time=4.216e-04, forward_time=0.209, loss_att=66.266, acc=0.942, loss=66.266, backward_time=0.327, grad_norm=93.012, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.933, time=2 hours, 9 minutes and 52.36 seconds, total_count=201685, gpu_max_cached_mem_GB=30.221, [valid] loss_att=66.717, acc=0.941, cer=0.072, wer=0.211, loss=66.717, time=3 minutes and 59.79 seconds, total_count=228, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 41.53 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:34:49,371 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:34:49,382 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/9epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:34:49,383 (trainer:272) INFO: 20/60epoch started. Estimated time to finish: 3 days, 22 hours and 34 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:43:56,203 (trainer:732) INFO: 20epoch:train:1-530batch: iter_time=0.004, forward_time=0.209, loss_att=63.676, acc=0.944, loss=63.676, backward_time=0.326, grad_norm=99.832, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=4.131 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:50:18,161 (trainer:732) INFO: 20epoch:train:531-1060batch: iter_time=2.919e-04, forward_time=0.210, loss_att=64.947, acc=0.944, loss=64.947, backward_time=0.329, grad_norm=90.828, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.882 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 12:56:39,066 (trainer:732) INFO: 20epoch:train:1061-1590batch: iter_time=2.874e-04, forward_time=0.209, loss_att=64.633, acc=0.944, loss=64.633, backward_time=0.328, grad_norm=88.763, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:02:59,447 (trainer:732) INFO: 20epoch:train:1591-2120batch: iter_time=2.812e-04, forward_time=0.208, loss_att=64.194, acc=0.944, loss=64.194, backward_time=0.326, grad_norm=90.033, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:09:17,735 (trainer:732) INFO: 20epoch:train:2121-2650batch: iter_time=2.733e-04, forward_time=0.207, loss_att=63.635, acc=0.943, loss=63.635, backward_time=0.325, grad_norm=90.003, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:15:38,838 (trainer:732) INFO: 20epoch:train:2651-3180batch: iter_time=2.797e-04, forward_time=0.209, loss_att=65.576, acc=0.944, loss=65.576, backward_time=0.328, grad_norm=88.931, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:21:58,609 (trainer:732) INFO: 20epoch:train:3181-3710batch: iter_time=2.691e-04, forward_time=0.208, loss_att=64.547, acc=0.943, loss=64.547, backward_time=0.326, grad_norm=94.132, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:28:20,352 (trainer:732) INFO: 20epoch:train:3711-4240batch: iter_time=2.783e-04, forward_time=0.209, loss_att=66.075, acc=0.943, loss=66.075, backward_time=0.328, grad_norm=92.994, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.879 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:34:40,995 (trainer:732) INFO: 20epoch:train:4241-4770batch: iter_time=2.848e-04, forward_time=0.209, loss_att=63.843, acc=0.944, loss=63.843, backward_time=0.327, grad_norm=96.015, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:40:59,972 (trainer:732) INFO: 20epoch:train:4771-5300batch: iter_time=2.830e-04, forward_time=0.208, loss_att=63.788, acc=0.943, loss=63.788, backward_time=0.326, grad_norm=90.836, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.858 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:47:20,702 (trainer:732) INFO: 20epoch:train:5301-5830batch: iter_time=2.845e-04, forward_time=0.209, loss_att=64.630, acc=0.944, loss=64.630, backward_time=0.327, grad_norm=98.660, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 13:53:40,440 (trainer:732) INFO: 20epoch:train:5831-6360batch: iter_time=2.784e-04, forward_time=0.208, loss_att=63.402, acc=0.944, loss=63.402, backward_time=0.327, grad_norm=91.027, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:00:00,040 (trainer:732) INFO: 20epoch:train:6361-6890batch: iter_time=2.702e-04, forward_time=0.209, loss_att=64.943, acc=0.944, loss=64.943, backward_time=0.327, grad_norm=94.502, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:06:19,418 (trainer:732) INFO: 20epoch:train:6891-7420batch: iter_time=2.738e-04, forward_time=0.208, loss_att=64.295, acc=0.943, loss=64.295, backward_time=0.326, grad_norm=99.731, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:12:39,008 (trainer:732) INFO: 20epoch:train:7421-7950batch: iter_time=2.849e-04, forward_time=0.209, loss_att=63.206, acc=0.945, loss=63.206, backward_time=0.327, grad_norm=100.439, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:18:59,507 (trainer:732) INFO: 20epoch:train:7951-8480batch: iter_time=2.929e-04, forward_time=0.209, loss_att=64.442, acc=0.943, loss=64.442, backward_time=0.327, grad_norm=97.764, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:25:19,086 (trainer:732) INFO: 20epoch:train:8481-9010batch: iter_time=2.870e-04, forward_time=0.208, loss_att=63.817, acc=0.944, loss=63.817, backward_time=0.326, grad_norm=89.360, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.866 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:31:41,481 (trainer:732) INFO: 20epoch:train:9011-9540batch: iter_time=2.952e-04, forward_time=0.210, loss_att=64.797, acc=0.944, loss=64.797, backward_time=0.327, grad_norm=98.004, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.884 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:38:02,382 (trainer:732) INFO: 20epoch:train:9541-10070batch: iter_time=2.995e-04, forward_time=0.209, loss_att=64.861, acc=0.943, loss=64.861, backward_time=0.326, grad_norm=91.934, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:44:23,193 (trainer:732) INFO: 20epoch:train:10071-10600batch: iter_time=2.662e-04, forward_time=0.209, loss_att=63.991, acc=0.943, loss=63.991, backward_time=0.328, grad_norm=94.239, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:52:21,214 (trainer:338) INFO: 20epoch results: [train] iter_time=4.738e-04, forward_time=0.209, loss_att=64.366, acc=0.944, loss=64.366, backward_time=0.327, grad_norm=93.920, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.933, time=2 hours, 9 minutes and 52.49 seconds, total_count=212300, gpu_max_cached_mem_GB=30.221, [valid] loss_att=67.159, acc=0.942, cer=0.074, wer=0.213, loss=67.159, time=3 minutes and 59.78 seconds, total_count=240, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 39.55 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:52:26,852 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:52:26,860 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/10epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 14:52:26,860 (trainer:272) INFO: 21/60epoch started. Estimated time to finish: 3 days, 20 hours and 13 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:01:49,077 (trainer:732) INFO: 21epoch:train:1-530batch: iter_time=0.003, forward_time=0.210, loss_att=62.805, acc=0.946, loss=62.805, backward_time=0.328, grad_norm=89.578, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=4.249 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:08:09,240 (trainer:732) INFO: 21epoch:train:531-1060batch: iter_time=2.944e-04, forward_time=0.208, loss_att=62.005, acc=0.946, loss=62.005, backward_time=0.327, grad_norm=93.034, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:14:29,710 (trainer:732) INFO: 21epoch:train:1061-1590batch: iter_time=2.883e-04, forward_time=0.209, loss_att=62.287, acc=0.946, loss=62.287, backward_time=0.328, grad_norm=92.732, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.873 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:20:49,992 (trainer:732) INFO: 21epoch:train:1591-2120batch: iter_time=2.996e-04, forward_time=0.208, loss_att=62.958, acc=0.945, loss=62.958, backward_time=0.326, grad_norm=93.034, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.868 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:27:10,389 (trainer:732) INFO: 21epoch:train:2121-2650batch: iter_time=2.947e-04, forward_time=0.208, loss_att=62.681, acc=0.945, loss=62.681, backward_time=0.325, grad_norm=95.042, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:33:31,952 (trainer:732) INFO: 21epoch:train:2651-3180batch: iter_time=2.956e-04, forward_time=0.209, loss_att=63.316, acc=0.945, loss=63.316, backward_time=0.328, grad_norm=94.049, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.878 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:39:52,460 (trainer:732) INFO: 21epoch:train:3181-3710batch: iter_time=2.873e-04, forward_time=0.208, loss_att=63.099, acc=0.945, loss=63.099, backward_time=0.326, grad_norm=90.985, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.872 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:46:12,467 (trainer:732) INFO: 21epoch:train:3711-4240batch: iter_time=2.772e-04, forward_time=0.208, loss_att=62.132, acc=0.945, loss=62.132, backward_time=0.326, grad_norm=90.435, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:52:34,224 (trainer:732) INFO: 21epoch:train:4241-4770batch: iter_time=2.734e-04, forward_time=0.209, loss_att=64.258, acc=0.945, loss=64.258, backward_time=0.328, grad_norm=91.232, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.882 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 15:58:54,272 (trainer:732) INFO: 21epoch:train:4771-5300batch: iter_time=2.820e-04, forward_time=0.208, loss_att=64.577, acc=0.944, loss=64.577, backward_time=0.326, grad_norm=93.132, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.867 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:05:14,603 (trainer:732) INFO: 21epoch:train:5301-5830batch: iter_time=2.976e-04, forward_time=0.208, loss_att=62.318, acc=0.945, loss=62.318, backward_time=0.326, grad_norm=91.159, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:11:36,466 (trainer:732) INFO: 21epoch:train:5831-6360batch: iter_time=2.828e-04, forward_time=0.210, loss_att=62.987, acc=0.946, loss=62.987, backward_time=0.328, grad_norm=95.902, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.881 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:17:57,210 (trainer:732) INFO: 21epoch:train:6361-6890batch: iter_time=2.896e-04, forward_time=0.209, loss_att=61.584, acc=0.946, loss=61.584, backward_time=0.327, grad_norm=94.725, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:24:17,910 (trainer:732) INFO: 21epoch:train:6891-7420batch: iter_time=2.808e-04, forward_time=0.209, loss_att=62.033, acc=0.945, loss=62.033, backward_time=0.327, grad_norm=92.260, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.871 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:30:38,067 (trainer:732) INFO: 21epoch:train:7421-7950batch: iter_time=2.884e-04, forward_time=0.208, loss_att=63.769, acc=0.945, loss=63.769, backward_time=0.327, grad_norm=95.803, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=2.869 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:36:56,826 (trainer:732) INFO: 21epoch:train:7951-8480batch: iter_time=2.879e-04, forward_time=0.208, loss_att=61.501, acc=0.944, loss=61.501, backward_time=0.325, grad_norm=89.519, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.857 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:43:13,871 (trainer:732) INFO: 21epoch:train:8481-9010batch: iter_time=2.924e-04, forward_time=0.207, loss_att=62.130, acc=0.944, loss=62.130, backward_time=0.324, grad_norm=93.033, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.846 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:49:35,809 (trainer:732) INFO: 21epoch:train:9011-9540batch: iter_time=2.846e-04, forward_time=0.210, loss_att=64.219, acc=0.945, loss=64.219, backward_time=0.329, grad_norm=90.860, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.881 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 16:55:54,722 (trainer:732) INFO: 21epoch:train:9541-10070batch: iter_time=2.995e-04, forward_time=0.208, loss_att=62.474, acc=0.945, loss=62.474, backward_time=0.326, grad_norm=91.674, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.860 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:02:15,888 (trainer:732) INFO: 21epoch:train:10071-10600batch: iter_time=2.902e-04, forward_time=0.209, loss_att=63.420, acc=0.945, loss=63.420, backward_time=0.327, grad_norm=96.792, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:10:23,877 (trainer:338) INFO: 21epoch results: [train] iter_time=4.376e-04, forward_time=0.209, loss_att=62.812, acc=0.945, loss=62.812, backward_time=0.327, grad_norm=92.735, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.939, time=2 hours, 10 minutes and 9.39 seconds, total_count=222915, gpu_max_cached_mem_GB=30.221, [valid] loss_att=65.110, acc=0.943, cer=0.072, wer=0.208, loss=65.110, time=3 minutes and 56.12 seconds, total_count=252, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 51.49 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:10:29,986 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:10:29,994 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/11epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:10:29,994 (trainer:272) INFO: 22/60epoch started. Estimated time to finish: 3 days, 17 hours and 54 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:19:39,702 (trainer:732) INFO: 22epoch:train:1-530batch: iter_time=0.003, forward_time=0.211, loss_att=59.746, acc=0.947, loss=59.746, backward_time=0.327, grad_norm=92.342, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=4.154 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:25:59,247 (trainer:732) INFO: 22epoch:train:531-1060batch: iter_time=3.629e-04, forward_time=0.209, loss_att=61.029, acc=0.946, loss=61.029, backward_time=0.326, grad_norm=92.106, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.863 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:32:21,097 (trainer:732) INFO: 22epoch:train:1061-1590batch: iter_time=4.022e-04, forward_time=0.211, loss_att=61.672, acc=0.946, loss=61.672, backward_time=0.329, grad_norm=99.574, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.883 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:38:41,607 (trainer:732) INFO: 22epoch:train:1591-2120batch: iter_time=3.528e-04, forward_time=0.209, loss_att=60.527, acc=0.947, loss=60.527, backward_time=0.326, grad_norm=95.942, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.870 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:45:04,581 (trainer:732) INFO: 22epoch:train:2121-2650batch: iter_time=3.795e-04, forward_time=0.211, loss_att=61.240, acc=0.946, loss=61.240, backward_time=0.328, grad_norm=96.648, clip=100.000, loss_scale=1.000, optim_step_time=0.097, optim0_lr0=0.001, train_time=2.891 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:51:32,515 (trainer:732) INFO: 22epoch:train:2651-3180batch: iter_time=4.045e-04, forward_time=0.215, loss_att=64.039, acc=0.946, loss=64.039, backward_time=0.330, grad_norm=92.900, clip=100.000, loss_scale=1.000, optim_step_time=0.109, optim0_lr0=0.001, train_time=2.926 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 17:58:03,484 (trainer:732) INFO: 22epoch:train:3181-3710batch: iter_time=4.528e-04, forward_time=0.217, loss_att=62.012, acc=0.945, loss=62.012, backward_time=0.329, grad_norm=92.810, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:04:33,676 (trainer:732) INFO: 22epoch:train:3711-4240batch: iter_time=4.612e-04, forward_time=0.218, loss_att=60.377, acc=0.946, loss=60.377, backward_time=0.329, grad_norm=90.729, clip=100.000, loss_scale=1.000, optim_step_time=0.124, optim0_lr0=0.001, train_time=2.943 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:11:03,984 (trainer:732) INFO: 22epoch:train:4241-4770batch: iter_time=4.051e-04, forward_time=0.219, loss_att=60.900, acc=0.946, loss=60.900, backward_time=0.329, grad_norm=89.703, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=0.001, train_time=2.947 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:17:34,413 (trainer:732) INFO: 22epoch:train:4771-5300batch: iter_time=3.991e-04, forward_time=0.218, loss_att=61.881, acc=0.946, loss=61.881, backward_time=0.329, grad_norm=86.748, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=0.001, train_time=2.944 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:24:06,776 (trainer:732) INFO: 22epoch:train:5301-5830batch: iter_time=5.079e-04, forward_time=0.221, loss_att=61.608, acc=0.946, loss=61.608, backward_time=0.329, grad_norm=85.094, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.001, train_time=2.963 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:30:39,689 (trainer:732) INFO: 22epoch:train:5831-6360batch: iter_time=5.529e-04, forward_time=0.220, loss_att=62.043, acc=0.946, loss=62.043, backward_time=0.330, grad_norm=98.227, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=0.001, train_time=2.962 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:37:11,740 (trainer:732) INFO: 22epoch:train:6361-6890batch: iter_time=4.524e-04, forward_time=0.219, loss_att=61.574, acc=0.946, loss=61.574, backward_time=0.331, grad_norm=93.371, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=0.001, train_time=2.960 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:43:41,968 (trainer:732) INFO: 22epoch:train:6891-7420batch: iter_time=4.577e-04, forward_time=0.218, loss_att=61.834, acc=0.945, loss=61.834, backward_time=0.329, grad_norm=87.173, clip=100.000, loss_scale=1.000, optim_step_time=0.127, optim0_lr0=0.001, train_time=2.943 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:50:14,974 (trainer:732) INFO: 22epoch:train:7421-7950batch: iter_time=4.760e-04, forward_time=0.220, loss_att=61.833, acc=0.946, loss=61.833, backward_time=0.330, grad_norm=89.716, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.001, train_time=2.967 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 18:56:47,353 (trainer:732) INFO: 22epoch:train:7951-8480batch: iter_time=4.383e-04, forward_time=0.218, loss_att=61.675, acc=0.947, loss=61.675, backward_time=0.331, grad_norm=95.528, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.001, train_time=2.959 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:03:18,705 (trainer:732) INFO: 22epoch:train:8481-9010batch: iter_time=4.649e-04, forward_time=0.218, loss_att=61.064, acc=0.946, loss=61.064, backward_time=0.330, grad_norm=91.947, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.954 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:09:52,404 (trainer:732) INFO: 22epoch:train:9011-9540batch: iter_time=4.097e-04, forward_time=0.220, loss_att=62.569, acc=0.946, loss=62.569, backward_time=0.332, grad_norm=93.756, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.969 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:16:21,051 (trainer:732) INFO: 22epoch:train:9541-10070batch: iter_time=4.708e-04, forward_time=0.217, loss_att=60.172, acc=0.946, loss=60.172, backward_time=0.328, grad_norm=93.360, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=0.001, train_time=2.934 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:22:52,406 (trainer:732) INFO: 22epoch:train:10071-10600batch: iter_time=4.542e-04, forward_time=0.219, loss_att=59.604, acc=0.947, loss=59.604, backward_time=0.330, grad_norm=92.212, clip=100.000, loss_scale=1.000, optim_step_time=0.125, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:31:31,056 (trainer:338) INFO: 22epoch results: [train] iter_time=5.477e-04, forward_time=0.216, loss_att=61.363, acc=0.946, loss=61.363, backward_time=0.329, grad_norm=92.495, clip=100.000, loss_scale=1.000, optim_step_time=0.118, optim0_lr0=0.001, train_time=2.996, time=2 hours, 12 minutes and 44.77 seconds, total_count=233530, gpu_max_cached_mem_GB=30.221, [valid] loss_att=64.689, acc=0.943, cer=0.070, wer=0.206, loss=64.689, time=3 minutes and 57.72 seconds, total_count=264, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 18.57 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:31:38,573 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:31:38,583 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/12epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:31:38,583 (trainer:272) INFO: 23/60epoch started. Estimated time to finish: 3 days, 15 hours and 43 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:41:23,600 (trainer:732) INFO: 23epoch:train:1-530batch: iter_time=0.002, forward_time=0.226, loss_att=60.663, acc=0.947, loss=60.663, backward_time=0.333, grad_norm=94.548, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.001, train_time=4.421 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:47:59,477 (trainer:732) INFO: 23epoch:train:531-1060batch: iter_time=5.686e-04, forward_time=0.225, loss_att=59.775, acc=0.948, loss=59.775, backward_time=0.333, grad_norm=89.918, clip=100.000, loss_scale=1.000, optim_step_time=0.128, optim0_lr0=0.001, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 19:54:32,010 (trainer:732) INFO: 23epoch:train:1061-1590batch: iter_time=5.835e-04, forward_time=0.221, loss_att=58.850, acc=0.948, loss=58.850, backward_time=0.331, grad_norm=98.257, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.963 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:01:03,020 (trainer:732) INFO: 23epoch:train:1591-2120batch: iter_time=6.117e-04, forward_time=0.220, loss_att=59.748, acc=0.947, loss=59.748, backward_time=0.330, grad_norm=88.372, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=0.001, train_time=2.950 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:07:36,179 (trainer:732) INFO: 23epoch:train:2121-2650batch: iter_time=5.471e-04, forward_time=0.222, loss_att=58.342, acc=0.948, loss=58.342, backward_time=0.330, grad_norm=100.533, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.001, train_time=2.969 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:14:10,493 (trainer:732) INFO: 23epoch:train:2651-3180batch: iter_time=5.360e-04, forward_time=0.222, loss_att=60.575, acc=0.947, loss=60.575, backward_time=0.331, grad_norm=93.143, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.973 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:20:45,826 (trainer:732) INFO: 23epoch:train:3181-3710batch: iter_time=4.838e-04, forward_time=0.222, loss_att=60.781, acc=0.947, loss=60.781, backward_time=0.333, grad_norm=95.609, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:27:19,962 (trainer:732) INFO: 23epoch:train:3711-4240batch: iter_time=4.883e-04, forward_time=0.222, loss_att=60.162, acc=0.947, loss=60.162, backward_time=0.331, grad_norm=96.396, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=2.972 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:33:54,059 (trainer:732) INFO: 23epoch:train:4241-4770batch: iter_time=5.852e-04, forward_time=0.222, loss_att=60.048, acc=0.947, loss=60.048, backward_time=0.331, grad_norm=89.070, clip=100.000, loss_scale=1.000, optim_step_time=0.133, optim0_lr0=0.001, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:40:27,373 (trainer:732) INFO: 23epoch:train:4771-5300batch: iter_time=4.726e-04, forward_time=0.222, loss_att=59.975, acc=0.946, loss=59.975, backward_time=0.331, grad_norm=89.403, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.968 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:47:02,436 (trainer:732) INFO: 23epoch:train:5301-5830batch: iter_time=4.388e-04, forward_time=0.223, loss_att=60.998, acc=0.947, loss=60.998, backward_time=0.333, grad_norm=92.766, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.982 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 20:53:35,852 (trainer:732) INFO: 23epoch:train:5831-6360batch: iter_time=4.702e-04, forward_time=0.221, loss_att=60.080, acc=0.947, loss=60.080, backward_time=0.330, grad_norm=91.377, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=0.001, train_time=2.968 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:00:10,116 (trainer:732) INFO: 23epoch:train:6361-6890batch: iter_time=4.602e-04, forward_time=0.222, loss_att=59.913, acc=0.948, loss=59.913, backward_time=0.332, grad_norm=89.811, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=0.001, train_time=2.977 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:06:45,584 (trainer:732) INFO: 23epoch:train:6891-7420batch: iter_time=4.803e-04, forward_time=0.222, loss_att=61.172, acc=0.948, loss=61.172, backward_time=0.333, grad_norm=89.657, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.982 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:13:18,628 (trainer:732) INFO: 23epoch:train:7421-7950batch: iter_time=5.525e-04, forward_time=0.221, loss_att=59.625, acc=0.947, loss=59.625, backward_time=0.329, grad_norm=95.075, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:19:52,676 (trainer:732) INFO: 23epoch:train:7951-8480batch: iter_time=5.561e-04, forward_time=0.222, loss_att=59.726, acc=0.948, loss=59.726, backward_time=0.332, grad_norm=90.047, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.973 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:26:27,739 (trainer:732) INFO: 23epoch:train:8481-9010batch: iter_time=4.949e-04, forward_time=0.223, loss_att=59.904, acc=0.948, loss=59.904, backward_time=0.332, grad_norm=90.992, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.983 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:33:01,053 (trainer:732) INFO: 23epoch:train:9011-9540batch: iter_time=4.540e-04, forward_time=0.221, loss_att=60.813, acc=0.948, loss=60.813, backward_time=0.331, grad_norm=92.358, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:39:33,300 (trainer:732) INFO: 23epoch:train:9541-10070batch: iter_time=5.270e-04, forward_time=0.221, loss_att=59.640, acc=0.948, loss=59.640, backward_time=0.330, grad_norm=91.011, clip=100.000, loss_scale=1.000, optim_step_time=0.130, optim0_lr0=0.001, train_time=2.961 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:46:05,087 (trainer:732) INFO: 23epoch:train:10071-10600batch: iter_time=4.626e-04, forward_time=0.220, loss_att=59.776, acc=0.947, loss=59.776, backward_time=0.329, grad_norm=91.952, clip=100.000, loss_scale=1.000, optim_step_time=0.133, optim0_lr0=0.001, train_time=2.954 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:54:33,618 (trainer:338) INFO: 23epoch results: [train] iter_time=6.075e-04, forward_time=0.222, loss_att=60.014, acc=0.947, loss=60.014, backward_time=0.331, grad_norm=92.522, clip=100.000, loss_scale=1.000, optim_step_time=0.131, optim0_lr0=0.001, train_time=3.043, time=2 hours, 14 minutes and 46.38 seconds, total_count=244145, gpu_max_cached_mem_GB=30.221, [valid] loss_att=64.217, acc=0.944, cer=0.071, wer=0.206, loss=64.217, time=3 minutes and 51.29 seconds, total_count=276, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 17.36 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:54:41,141 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:54:41,150 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/13epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 21:54:41,151 (trainer:272) INFO: 24/60epoch started. Estimated time to finish: 3 days, 13 hours and 36 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:04:26,691 (trainer:732) INFO: 24epoch:train:1-530batch: iter_time=0.003, forward_time=0.225, loss_att=58.619, acc=0.949, loss=58.619, backward_time=0.332, grad_norm=91.528, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=4.425 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:11:05,483 (trainer:732) INFO: 24epoch:train:531-1060batch: iter_time=4.980e-04, forward_time=0.226, loss_att=56.377, acc=0.950, loss=56.377, backward_time=0.332, grad_norm=92.445, clip=100.000, loss_scale=1.000, optim_step_time=0.146, optim0_lr0=0.001, train_time=3.008 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:17:42,756 (trainer:732) INFO: 24epoch:train:1061-1590batch: iter_time=6.188e-04, forward_time=0.225, loss_att=58.658, acc=0.949, loss=58.658, backward_time=0.333, grad_norm=99.502, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=0.001, train_time=3.000 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:24:17,835 (trainer:732) INFO: 24epoch:train:1591-2120batch: iter_time=6.921e-04, forward_time=0.223, loss_att=59.308, acc=0.948, loss=59.308, backward_time=0.331, grad_norm=89.634, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=0.001, train_time=2.979 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:30:54,493 (trainer:732) INFO: 24epoch:train:2121-2650batch: iter_time=5.445e-04, forward_time=0.224, loss_att=60.419, acc=0.948, loss=60.419, backward_time=0.332, grad_norm=97.548, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:37:30,183 (trainer:732) INFO: 24epoch:train:2651-3180batch: iter_time=4.771e-04, forward_time=0.222, loss_att=58.571, acc=0.948, loss=58.571, backward_time=0.332, grad_norm=95.781, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:44:04,204 (trainer:732) INFO: 24epoch:train:3181-3710batch: iter_time=4.948e-04, forward_time=0.223, loss_att=59.861, acc=0.948, loss=59.861, backward_time=0.331, grad_norm=94.219, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=0.001, train_time=2.975 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:50:40,385 (trainer:732) INFO: 24epoch:train:3711-4240batch: iter_time=5.984e-04, forward_time=0.222, loss_att=58.799, acc=0.949, loss=58.799, backward_time=0.331, grad_norm=92.671, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.988 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 22:57:13,433 (trainer:732) INFO: 24epoch:train:4241-4770batch: iter_time=4.969e-04, forward_time=0.220, loss_att=59.322, acc=0.948, loss=59.322, backward_time=0.329, grad_norm=87.712, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.968 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:03:48,078 (trainer:732) INFO: 24epoch:train:4771-5300batch: iter_time=5.541e-04, forward_time=0.221, loss_att=58.451, acc=0.949, loss=58.451, backward_time=0.331, grad_norm=94.789, clip=100.000, loss_scale=1.000, optim_step_time=0.133, optim0_lr0=0.001, train_time=2.976 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:10:25,483 (trainer:732) INFO: 24epoch:train:5301-5830batch: iter_time=4.757e-04, forward_time=0.223, loss_att=59.661, acc=0.949, loss=59.661, backward_time=0.334, grad_norm=89.041, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.999 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:17:01,003 (trainer:732) INFO: 24epoch:train:5831-6360batch: iter_time=4.326e-04, forward_time=0.222, loss_att=59.458, acc=0.948, loss=59.458, backward_time=0.331, grad_norm=89.298, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=0.001, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:23:33,566 (trainer:732) INFO: 24epoch:train:6361-6890batch: iter_time=5.087e-04, forward_time=0.220, loss_att=58.005, acc=0.948, loss=58.005, backward_time=0.329, grad_norm=89.848, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.963 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:30:07,675 (trainer:732) INFO: 24epoch:train:6891-7420batch: iter_time=4.412e-04, forward_time=0.222, loss_att=57.778, acc=0.948, loss=57.778, backward_time=0.330, grad_norm=86.848, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=2.972 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:36:42,113 (trainer:732) INFO: 24epoch:train:7421-7950batch: iter_time=4.908e-04, forward_time=0.221, loss_att=57.909, acc=0.949, loss=57.909, backward_time=0.329, grad_norm=90.032, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.978 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:43:14,906 (trainer:732) INFO: 24epoch:train:7951-8480batch: iter_time=4.990e-04, forward_time=0.220, loss_att=58.173, acc=0.948, loss=58.173, backward_time=0.330, grad_norm=89.970, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=0.001, train_time=2.962 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:49:50,288 (trainer:732) INFO: 24epoch:train:8481-9010batch: iter_time=6.888e-04, forward_time=0.221, loss_att=59.463, acc=0.948, loss=59.463, backward_time=0.331, grad_norm=90.820, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-22 23:56:23,643 (trainer:732) INFO: 24epoch:train:9011-9540batch: iter_time=5.399e-04, forward_time=0.220, loss_att=58.959, acc=0.949, loss=58.959, backward_time=0.331, grad_norm=90.296, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:02:57,020 (trainer:732) INFO: 24epoch:train:9541-10070batch: iter_time=4.953e-04, forward_time=0.221, loss_att=58.959, acc=0.948, loss=58.959, backward_time=0.330, grad_norm=93.291, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.970 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:09:28,563 (trainer:732) INFO: 24epoch:train:10071-10600batch: iter_time=4.712e-04, forward_time=0.219, loss_att=58.361, acc=0.949, loss=58.361, backward_time=0.328, grad_norm=93.905, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=0.001, train_time=2.953 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:18:09,725 (trainer:338) INFO: 24epoch results: [train] iter_time=6.558e-04, forward_time=0.222, loss_att=58.748, acc=0.948, loss=58.748, backward_time=0.331, grad_norm=91.954, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=3.051, time=2 hours, 15 minutes and 6.6 seconds, total_count=254760, gpu_max_cached_mem_GB=30.221, [valid] loss_att=63.778, acc=0.944, cer=0.071, wer=0.204, loss=63.778, time=4 minutes and 10.28 seconds, total_count=288, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 11.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:18:17,459 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:18:17,471 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/14epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:18:17,472 (trainer:272) INFO: 25/60epoch started. Estimated time to finish: 3 days, 11 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:28:06,405 (trainer:732) INFO: 25epoch:train:1-530batch: iter_time=0.002, forward_time=0.227, loss_att=58.719, acc=0.951, loss=58.719, backward_time=0.334, grad_norm=94.612, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=4.449 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:34:43,216 (trainer:732) INFO: 25epoch:train:531-1060batch: iter_time=4.781e-04, forward_time=0.224, loss_att=57.619, acc=0.950, loss=57.619, backward_time=0.332, grad_norm=97.119, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:41:18,523 (trainer:732) INFO: 25epoch:train:1061-1590batch: iter_time=4.745e-04, forward_time=0.223, loss_att=57.429, acc=0.950, loss=57.429, backward_time=0.332, grad_norm=91.366, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=0.001, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:47:51,473 (trainer:732) INFO: 25epoch:train:1591-2120batch: iter_time=6.010e-04, forward_time=0.221, loss_att=56.747, acc=0.950, loss=56.747, backward_time=0.330, grad_norm=95.024, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=0.001, train_time=2.964 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 00:54:25,894 (trainer:732) INFO: 25epoch:train:2121-2650batch: iter_time=4.454e-04, forward_time=0.222, loss_att=57.094, acc=0.950, loss=57.094, backward_time=0.331, grad_norm=86.480, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.977 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:00:58,897 (trainer:732) INFO: 25epoch:train:2651-3180batch: iter_time=4.995e-04, forward_time=0.222, loss_att=56.673, acc=0.949, loss=56.673, backward_time=0.329, grad_norm=88.181, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.964 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:07:33,290 (trainer:732) INFO: 25epoch:train:3181-3710batch: iter_time=4.928e-04, forward_time=0.222, loss_att=57.312, acc=0.949, loss=57.312, backward_time=0.330, grad_norm=94.395, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.977 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:14:06,146 (trainer:732) INFO: 25epoch:train:3711-4240batch: iter_time=4.515e-04, forward_time=0.220, loss_att=56.581, acc=0.950, loss=56.581, backward_time=0.330, grad_norm=93.411, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=0.001, train_time=2.963 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:20:40,047 (trainer:732) INFO: 25epoch:train:4241-4770batch: iter_time=4.931e-04, forward_time=0.221, loss_att=58.015, acc=0.950, loss=58.015, backward_time=0.331, grad_norm=92.695, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=0.001, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:27:12,185 (trainer:732) INFO: 25epoch:train:4771-5300batch: iter_time=4.736e-04, forward_time=0.220, loss_att=56.341, acc=0.949, loss=56.341, backward_time=0.329, grad_norm=87.605, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.957 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:33:42,090 (trainer:732) INFO: 25epoch:train:5301-5830batch: iter_time=4.905e-04, forward_time=0.217, loss_att=58.189, acc=0.949, loss=58.189, backward_time=0.327, grad_norm=87.271, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.943 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:40:14,149 (trainer:732) INFO: 25epoch:train:5831-6360batch: iter_time=4.655e-04, forward_time=0.220, loss_att=57.760, acc=0.950, loss=57.760, backward_time=0.329, grad_norm=94.119, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.957 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:46:47,059 (trainer:732) INFO: 25epoch:train:6361-6890batch: iter_time=4.153e-04, forward_time=0.219, loss_att=57.629, acc=0.949, loss=57.629, backward_time=0.330, grad_norm=95.465, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=0.001, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:53:19,951 (trainer:732) INFO: 25epoch:train:6891-7420batch: iter_time=5.120e-04, forward_time=0.219, loss_att=58.692, acc=0.949, loss=58.692, backward_time=0.330, grad_norm=88.529, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.964 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 01:59:51,864 (trainer:732) INFO: 25epoch:train:7421-7950batch: iter_time=4.243e-04, forward_time=0.218, loss_att=58.487, acc=0.950, loss=58.487, backward_time=0.330, grad_norm=87.798, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.958 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:06:22,358 (trainer:732) INFO: 25epoch:train:7951-8480batch: iter_time=4.364e-04, forward_time=0.217, loss_att=56.273, acc=0.950, loss=56.273, backward_time=0.328, grad_norm=91.676, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.945 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:12:53,238 (trainer:732) INFO: 25epoch:train:8481-9010batch: iter_time=4.130e-04, forward_time=0.217, loss_att=57.992, acc=0.949, loss=57.992, backward_time=0.328, grad_norm=97.983, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:19:25,496 (trainer:732) INFO: 25epoch:train:9011-9540batch: iter_time=4.013e-04, forward_time=0.218, loss_att=57.757, acc=0.949, loss=57.757, backward_time=0.329, grad_norm=87.620, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.958 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:25:55,733 (trainer:732) INFO: 25epoch:train:9541-10070batch: iter_time=4.241e-04, forward_time=0.217, loss_att=58.063, acc=0.949, loss=58.063, backward_time=0.327, grad_norm=93.080, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.946 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:32:28,986 (trainer:732) INFO: 25epoch:train:10071-10600batch: iter_time=4.048e-04, forward_time=0.218, loss_att=57.053, acc=0.951, loss=57.053, backward_time=0.330, grad_norm=91.105, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:41:09,536 (trainer:338) INFO: 25epoch results: [train] iter_time=5.613e-04, forward_time=0.220, loss_att=57.517, acc=0.950, loss=57.517, backward_time=0.330, grad_norm=91.772, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=3.037, time=2 hours, 14 minutes and 31.41 seconds, total_count=265375, gpu_max_cached_mem_GB=30.221, [valid] loss_att=62.856, acc=0.945, cer=0.070, wer=0.203, loss=62.856, time=4 minutes and 14.91 seconds, total_count=300, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 5.74 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:41:17,090 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:41:17,102 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/15epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:41:17,102 (trainer:272) INFO: 26/60epoch started. Estimated time to finish: 3 days, 9 hours and 17 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:50:47,185 (trainer:732) INFO: 26epoch:train:1-530batch: iter_time=0.002, forward_time=0.217, loss_att=54.649, acc=0.951, loss=54.649, backward_time=0.328, grad_norm=87.410, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=4.308 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 02:57:16,383 (trainer:732) INFO: 26epoch:train:531-1060batch: iter_time=4.730e-04, forward_time=0.216, loss_att=56.067, acc=0.950, loss=56.067, backward_time=0.327, grad_norm=89.439, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.935 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:03:47,239 (trainer:732) INFO: 26epoch:train:1061-1590batch: iter_time=4.004e-04, forward_time=0.216, loss_att=56.176, acc=0.950, loss=56.176, backward_time=0.328, grad_norm=90.024, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:10:18,465 (trainer:732) INFO: 26epoch:train:1591-2120batch: iter_time=4.112e-04, forward_time=0.216, loss_att=57.348, acc=0.951, loss=57.348, backward_time=0.329, grad_norm=93.915, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:16:48,281 (trainer:732) INFO: 26epoch:train:2121-2650batch: iter_time=3.943e-04, forward_time=0.215, loss_att=54.501, acc=0.951, loss=54.501, backward_time=0.328, grad_norm=90.809, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.942 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:23:20,068 (trainer:732) INFO: 26epoch:train:2651-3180batch: iter_time=3.990e-04, forward_time=0.217, loss_att=56.969, acc=0.950, loss=56.969, backward_time=0.329, grad_norm=94.925, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.955 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:29:51,055 (trainer:732) INFO: 26epoch:train:3181-3710batch: iter_time=4.130e-04, forward_time=0.216, loss_att=56.606, acc=0.950, loss=56.606, backward_time=0.329, grad_norm=88.943, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.952 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:36:20,606 (trainer:732) INFO: 26epoch:train:3711-4240batch: iter_time=4.171e-04, forward_time=0.214, loss_att=56.708, acc=0.950, loss=56.708, backward_time=0.327, grad_norm=89.308, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.938 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:42:49,267 (trainer:732) INFO: 26epoch:train:4241-4770batch: iter_time=4.081e-04, forward_time=0.214, loss_att=55.608, acc=0.950, loss=55.608, backward_time=0.327, grad_norm=85.791, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.933 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:49:22,143 (trainer:732) INFO: 26epoch:train:4771-5300batch: iter_time=3.964e-04, forward_time=0.216, loss_att=57.467, acc=0.950, loss=57.467, backward_time=0.330, grad_norm=92.945, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.964 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 03:55:51,519 (trainer:732) INFO: 26epoch:train:5301-5830batch: iter_time=4.107e-04, forward_time=0.215, loss_att=56.240, acc=0.951, loss=56.240, backward_time=0.327, grad_norm=83.926, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:02:22,830 (trainer:732) INFO: 26epoch:train:5831-6360batch: iter_time=3.738e-04, forward_time=0.216, loss_att=56.237, acc=0.951, loss=56.237, backward_time=0.329, grad_norm=102.247, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:08:51,445 (trainer:732) INFO: 26epoch:train:6361-6890batch: iter_time=3.688e-04, forward_time=0.214, loss_att=56.598, acc=0.950, loss=56.598, backward_time=0.326, grad_norm=86.060, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.933 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:15:21,023 (trainer:732) INFO: 26epoch:train:6891-7420batch: iter_time=3.977e-04, forward_time=0.214, loss_att=57.463, acc=0.950, loss=57.463, backward_time=0.327, grad_norm=87.423, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.938 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:21:51,294 (trainer:732) INFO: 26epoch:train:7421-7950batch: iter_time=3.931e-04, forward_time=0.215, loss_att=57.096, acc=0.951, loss=57.096, backward_time=0.329, grad_norm=89.893, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.946 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:28:21,337 (trainer:732) INFO: 26epoch:train:7951-8480batch: iter_time=3.703e-04, forward_time=0.215, loss_att=56.442, acc=0.951, loss=56.442, backward_time=0.328, grad_norm=95.321, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.942 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:34:51,869 (trainer:732) INFO: 26epoch:train:8481-9010batch: iter_time=3.825e-04, forward_time=0.215, loss_att=57.736, acc=0.950, loss=57.736, backward_time=0.329, grad_norm=93.226, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.949 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:41:22,617 (trainer:732) INFO: 26epoch:train:9011-9540batch: iter_time=4.010e-04, forward_time=0.214, loss_att=57.764, acc=0.950, loss=57.764, backward_time=0.328, grad_norm=92.078, clip=100.000, loss_scale=1.000, optim_step_time=0.146, optim0_lr0=0.001, train_time=2.946 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:47:51,603 (trainer:732) INFO: 26epoch:train:9541-10070batch: iter_time=4.156e-04, forward_time=0.214, loss_att=54.729, acc=0.950, loss=54.729, backward_time=0.327, grad_norm=92.617, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.936 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 04:54:23,675 (trainer:732) INFO: 26epoch:train:10071-10600batch: iter_time=3.683e-04, forward_time=0.215, loss_att=56.726, acc=0.951, loss=56.726, backward_time=0.329, grad_norm=89.147, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.957 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:02:34,691 (trainer:338) INFO: 26epoch results: [train] iter_time=4.957e-04, forward_time=0.215, loss_att=56.450, acc=0.950, loss=56.450, backward_time=0.328, grad_norm=90.759, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=3.013, time=2 hours, 13 minutes and 22.47 seconds, total_count=275990, gpu_max_cached_mem_GB=30.221, [valid] loss_att=62.001, acc=0.946, cer=0.070, wer=0.201, loss=62.001, time=4 minutes and 3.75 seconds, total_count=312, gpu_max_cached_mem_GB=30.221, [att_plot] time=3 minutes and 51.37 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:02:40,689 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:02:40,729 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/16epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:02:40,729 (trainer:272) INFO: 27/60epoch started. Estimated time to finish: 3 days, 7 hours and 1 minute +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:12:05,409 (trainer:732) INFO: 27epoch:train:1-530batch: iter_time=0.003, forward_time=0.217, loss_att=54.652, acc=0.952, loss=54.652, backward_time=0.328, grad_norm=96.451, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=4.267 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:18:35,965 (trainer:732) INFO: 27epoch:train:531-1060batch: iter_time=3.873e-04, forward_time=0.215, loss_att=55.464, acc=0.952, loss=55.464, backward_time=0.328, grad_norm=87.955, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.946 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<60452> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<60690> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<41756> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<7618> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10886> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10894> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<32795> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<32531> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<17135> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<17477> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<63345> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 152) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<24938> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<16463> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<25284> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<24003> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<26219> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<26215> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<19182> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 155) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 155) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<48908> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 155) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 155) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10544> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<41088> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<41087> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<39522> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<40252> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<22027> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<22169> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<42454> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<29400> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<5932> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<5922> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:25:07,418 (trainer:732) INFO: 27epoch:train:1061-1590batch: iter_time=4.252e-04, forward_time=0.216, loss_att=55.346, acc=0.952, loss=55.346, backward_time=0.329, grad_norm=92.163, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.956 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<40941> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<41487> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 143) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:31:37,612 (trainer:732) INFO: 27epoch:train:1591-2120batch: iter_time=3.929e-04, forward_time=0.215, loss_att=55.150, acc=0.951, loss=55.150, backward_time=0.328, grad_norm=88.390, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.942 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:38:08,302 (trainer:732) INFO: 27epoch:train:2121-2650batch: iter_time=4.067e-04, forward_time=0.215, loss_att=54.027, acc=0.952, loss=54.027, backward_time=0.328, grad_norm=88.775, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.950 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:44:37,917 (trainer:732) INFO: 27epoch:train:2651-3180batch: iter_time=3.818e-04, forward_time=0.214, loss_att=56.320, acc=0.951, loss=56.320, backward_time=0.327, grad_norm=89.234, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.938 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:51:09,905 (trainer:732) INFO: 27epoch:train:3181-3710batch: iter_time=3.920e-04, forward_time=0.215, loss_att=55.023, acc=0.952, loss=55.023, backward_time=0.329, grad_norm=87.587, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.960 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 05:57:40,061 (trainer:732) INFO: 27epoch:train:3711-4240batch: iter_time=4.221e-04, forward_time=0.214, loss_att=55.819, acc=0.951, loss=55.819, backward_time=0.328, grad_norm=90.997, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.942 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:04:09,204 (trainer:732) INFO: 27epoch:train:4241-4770batch: iter_time=3.875e-04, forward_time=0.214, loss_att=56.857, acc=0.951, loss=56.857, backward_time=0.327, grad_norm=92.267, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.937 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:10:39,670 (trainer:732) INFO: 27epoch:train:4771-5300batch: iter_time=4.050e-04, forward_time=0.214, loss_att=55.409, acc=0.951, loss=55.409, backward_time=0.329, grad_norm=89.463, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.945 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:17:08,125 (trainer:732) INFO: 27epoch:train:5301-5830batch: iter_time=3.998e-04, forward_time=0.214, loss_att=56.604, acc=0.950, loss=56.604, backward_time=0.327, grad_norm=87.371, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.933 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:23:37,889 (trainer:732) INFO: 27epoch:train:5831-6360batch: iter_time=3.732e-04, forward_time=0.213, loss_att=55.276, acc=0.951, loss=55.276, backward_time=0.327, grad_norm=93.320, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.939 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:30:07,587 (trainer:732) INFO: 27epoch:train:6361-6890batch: iter_time=3.905e-04, forward_time=0.214, loss_att=56.086, acc=0.951, loss=56.086, backward_time=0.329, grad_norm=87.140, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.941 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:36:37,979 (trainer:732) INFO: 27epoch:train:6891-7420batch: iter_time=3.926e-04, forward_time=0.214, loss_att=55.417, acc=0.951, loss=55.417, backward_time=0.328, grad_norm=98.110, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.945 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:43:05,590 (trainer:732) INFO: 27epoch:train:7421-7950batch: iter_time=3.795e-04, forward_time=0.213, loss_att=54.649, acc=0.952, loss=54.649, backward_time=0.326, grad_norm=92.589, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.926 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:49:35,237 (trainer:732) INFO: 27epoch:train:7951-8480batch: iter_time=3.654e-04, forward_time=0.214, loss_att=54.139, acc=0.952, loss=54.139, backward_time=0.328, grad_norm=85.240, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.939 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 06:56:02,427 (trainer:732) INFO: 27epoch:train:8481-9010batch: iter_time=3.766e-04, forward_time=0.212, loss_att=56.242, acc=0.950, loss=56.242, backward_time=0.325, grad_norm=89.097, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.924 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:02:30,428 (trainer:732) INFO: 27epoch:train:9011-9540batch: iter_time=3.767e-04, forward_time=0.213, loss_att=55.439, acc=0.950, loss=55.439, backward_time=0.326, grad_norm=89.663, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.926 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:09:02,035 (trainer:732) INFO: 27epoch:train:9541-10070batch: iter_time=3.879e-04, forward_time=0.215, loss_att=55.952, acc=0.951, loss=55.952, backward_time=0.330, grad_norm=88.918, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.957 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:15:31,622 (trainer:732) INFO: 27epoch:train:10071-10600batch: iter_time=3.806e-04, forward_time=0.214, loss_att=54.988, acc=0.952, loss=54.988, backward_time=0.327, grad_norm=93.434, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.938 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:23:55,509 (trainer:338) INFO: 27epoch results: [train] iter_time=5.133e-04, forward_time=0.214, loss_att=55.447, acc=0.951, loss=55.447, backward_time=0.328, grad_norm=90.414, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=3.007, time=2 hours, 13 minutes and 6.54 seconds, total_count=286605, gpu_max_cached_mem_GB=30.221, [valid] loss_att=62.074, acc=0.946, cer=0.069, wer=0.199, loss=62.074, time=4 minutes and 0.76 seconds, total_count=324, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 7.47 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:24:03,016 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:24:03,026 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/17epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:24:03,026 (trainer:272) INFO: 28/60epoch started. Estimated time to finish: 3 days, 4 hours and 45 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:33:33,991 (trainer:732) INFO: 28epoch:train:1-530batch: iter_time=0.004, forward_time=0.218, loss_att=53.114, acc=0.953, loss=53.114, backward_time=0.328, grad_norm=86.528, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=4.314 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:40:05,458 (trainer:732) INFO: 28epoch:train:531-1060batch: iter_time=4.146e-04, forward_time=0.217, loss_att=52.055, acc=0.953, loss=52.055, backward_time=0.328, grad_norm=86.751, clip=100.000, loss_scale=1.000, optim_step_time=0.146, optim0_lr0=0.001, train_time=2.953 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:46:35,625 (trainer:732) INFO: 28epoch:train:1061-1590batch: iter_time=3.974e-04, forward_time=0.216, loss_att=54.665, acc=0.953, loss=54.665, backward_time=0.328, grad_norm=90.943, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.946 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:53:04,648 (trainer:732) INFO: 28epoch:train:1591-2120batch: iter_time=3.823e-04, forward_time=0.215, loss_att=53.964, acc=0.952, loss=53.964, backward_time=0.327, grad_norm=85.730, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.933 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 07:59:36,676 (trainer:732) INFO: 28epoch:train:2121-2650batch: iter_time=3.926e-04, forward_time=0.217, loss_att=54.631, acc=0.952, loss=54.631, backward_time=0.330, grad_norm=99.637, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.959 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:06:07,911 (trainer:732) INFO: 28epoch:train:2651-3180batch: iter_time=3.948e-04, forward_time=0.215, loss_att=53.689, acc=0.953, loss=53.689, backward_time=0.329, grad_norm=90.324, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.951 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:12:37,654 (trainer:732) INFO: 28epoch:train:3181-3710batch: iter_time=3.882e-04, forward_time=0.216, loss_att=53.495, acc=0.952, loss=53.495, backward_time=0.328, grad_norm=88.830, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.943 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:19:07,741 (trainer:732) INFO: 28epoch:train:3711-4240batch: iter_time=3.995e-04, forward_time=0.215, loss_att=53.843, acc=0.953, loss=53.843, backward_time=0.328, grad_norm=88.846, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.941 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:25:36,698 (trainer:732) INFO: 28epoch:train:4241-4770batch: iter_time=3.967e-04, forward_time=0.215, loss_att=55.318, acc=0.952, loss=55.318, backward_time=0.327, grad_norm=93.554, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.936 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:32:07,813 (trainer:732) INFO: 28epoch:train:4771-5300batch: iter_time=3.810e-04, forward_time=0.216, loss_att=55.273, acc=0.952, loss=55.273, backward_time=0.329, grad_norm=93.636, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.950 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:38:37,044 (trainer:732) INFO: 28epoch:train:5301-5830batch: iter_time=3.843e-04, forward_time=0.214, loss_att=54.613, acc=0.952, loss=54.613, backward_time=0.327, grad_norm=87.562, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.939 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:45:06,915 (trainer:732) INFO: 28epoch:train:5831-6360batch: iter_time=3.788e-04, forward_time=0.214, loss_att=54.991, acc=0.952, loss=54.991, backward_time=0.328, grad_norm=86.670, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:51:35,326 (trainer:732) INFO: 28epoch:train:6361-6890batch: iter_time=3.963e-04, forward_time=0.214, loss_att=54.870, acc=0.951, loss=54.870, backward_time=0.327, grad_norm=95.048, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.931 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 08:58:05,026 (trainer:732) INFO: 28epoch:train:6891-7420batch: iter_time=3.938e-04, forward_time=0.215, loss_att=54.578, acc=0.951, loss=54.578, backward_time=0.327, grad_norm=86.773, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.940 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:04:33,269 (trainer:732) INFO: 28epoch:train:7421-7950batch: iter_time=3.808e-04, forward_time=0.213, loss_att=54.219, acc=0.952, loss=54.219, backward_time=0.326, grad_norm=91.502, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.931 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:11:01,386 (trainer:732) INFO: 28epoch:train:7951-8480batch: iter_time=4.057e-04, forward_time=0.214, loss_att=54.572, acc=0.952, loss=54.572, backward_time=0.326, grad_norm=93.218, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.927 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:17:31,182 (trainer:732) INFO: 28epoch:train:8481-9010batch: iter_time=3.853e-04, forward_time=0.215, loss_att=55.316, acc=0.951, loss=55.316, backward_time=0.327, grad_norm=90.666, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.943 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:24:02,132 (trainer:732) INFO: 28epoch:train:9011-9540batch: iter_time=3.913e-04, forward_time=0.215, loss_att=55.461, acc=0.952, loss=55.461, backward_time=0.329, grad_norm=93.356, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=0.001, train_time=2.948 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:30:33,273 (trainer:732) INFO: 28epoch:train:9541-10070batch: iter_time=3.840e-04, forward_time=0.216, loss_att=55.400, acc=0.952, loss=55.400, backward_time=0.329, grad_norm=86.045, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=0.001, train_time=2.953 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:37:03,797 (trainer:732) INFO: 28epoch:train:10071-10600batch: iter_time=3.761e-04, forward_time=0.215, loss_att=54.942, acc=0.952, loss=54.942, backward_time=0.328, grad_norm=88.853, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=2.945 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:45:26,322 (trainer:338) INFO: 28epoch results: [train] iter_time=5.529e-04, forward_time=0.215, loss_att=54.452, acc=0.952, loss=54.452, backward_time=0.328, grad_norm=90.192, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=0.001, train_time=3.011, time=2 hours, 13 minutes and 16.45 seconds, total_count=297220, gpu_max_cached_mem_GB=30.221, [valid] loss_att=61.102, acc=0.947, cer=0.067, wer=0.194, loss=61.102, time=4 minutes and 2.73 seconds, total_count=336, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 4.1 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:45:32,627 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:45:32,663 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/18epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:45:32,664 (trainer:272) INFO: 29/60epoch started. Estimated time to finish: 3 days, 2 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 09:54:52,850 (trainer:732) INFO: 29epoch:train:1-530batch: iter_time=0.004, forward_time=0.215, loss_att=52.185, acc=0.953, loss=52.185, backward_time=0.327, grad_norm=88.415, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=4.233 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:01:12,757 (trainer:732) INFO: 29epoch:train:531-1060batch: iter_time=4.130e-04, forward_time=0.208, loss_att=52.649, acc=0.953, loss=52.649, backward_time=0.326, grad_norm=86.457, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.865 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:07:33,584 (trainer:732) INFO: 29epoch:train:1061-1590batch: iter_time=3.429e-04, forward_time=0.209, loss_att=53.774, acc=0.953, loss=53.774, backward_time=0.327, grad_norm=86.950, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.875 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:13:55,379 (trainer:732) INFO: 29epoch:train:1591-2120batch: iter_time=3.468e-04, forward_time=0.210, loss_att=52.232, acc=0.954, loss=52.232, backward_time=0.328, grad_norm=89.484, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.880 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:20:16,369 (trainer:732) INFO: 29epoch:train:2121-2650batch: iter_time=3.567e-04, forward_time=0.209, loss_att=52.851, acc=0.953, loss=52.851, backward_time=0.327, grad_norm=88.099, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:26:37,700 (trainer:732) INFO: 29epoch:train:2651-3180batch: iter_time=3.527e-04, forward_time=0.210, loss_att=53.797, acc=0.953, loss=53.797, backward_time=0.327, grad_norm=91.490, clip=100.000, loss_scale=1.000, optim_step_time=0.096, optim0_lr0=0.001, train_time=2.876 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:33:00,143 (trainer:732) INFO: 29epoch:train:3181-3710batch: iter_time=3.716e-04, forward_time=0.212, loss_att=53.729, acc=0.954, loss=53.729, backward_time=0.328, grad_norm=88.143, clip=100.000, loss_scale=1.000, optim_step_time=0.094, optim0_lr0=0.001, train_time=2.887 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:39:21,167 (trainer:732) INFO: 29epoch:train:3711-4240batch: iter_time=3.439e-04, forward_time=0.209, loss_att=53.227, acc=0.953, loss=53.227, backward_time=0.328, grad_norm=93.133, clip=100.000, loss_scale=1.000, optim_step_time=0.091, optim0_lr0=0.001, train_time=2.874 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:45:46,795 (trainer:732) INFO: 29epoch:train:4241-4770batch: iter_time=3.657e-04, forward_time=0.213, loss_att=54.257, acc=0.953, loss=54.257, backward_time=0.329, grad_norm=92.884, clip=100.000, loss_scale=1.000, optim_step_time=0.100, optim0_lr0=0.001, train_time=2.910 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:52:11,011 (trainer:732) INFO: 29epoch:train:4771-5300batch: iter_time=3.364e-04, forward_time=0.212, loss_att=54.024, acc=0.953, loss=54.024, backward_time=0.329, grad_norm=92.745, clip=100.000, loss_scale=1.000, optim_step_time=0.099, optim0_lr0=0.001, train_time=2.899 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 10:58:37,212 (trainer:732) INFO: 29epoch:train:5301-5830batch: iter_time=4.739e-04, forward_time=0.214, loss_att=53.746, acc=0.952, loss=53.746, backward_time=0.327, grad_norm=90.029, clip=100.000, loss_scale=1.000, optim_step_time=0.113, optim0_lr0=0.001, train_time=2.916 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:05:15,053 (trainer:732) INFO: 29epoch:train:5831-6360batch: iter_time=5.095e-04, forward_time=0.221, loss_att=53.458, acc=0.954, loss=53.458, backward_time=0.332, grad_norm=92.473, clip=100.000, loss_scale=1.000, optim_step_time=0.153, optim0_lr0=0.001, train_time=3.000 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:11:59,925 (trainer:732) INFO: 29epoch:train:6361-6890batch: iter_time=7.074e-04, forward_time=0.230, loss_att=53.986, acc=0.953, loss=53.986, backward_time=0.333, grad_norm=88.654, clip=100.000, loss_scale=1.000, optim_step_time=0.179, optim0_lr0=0.001, train_time=3.056 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:18:41,298 (trainer:732) INFO: 29epoch:train:6891-7420batch: iter_time=5.873e-04, forward_time=0.226, loss_att=53.241, acc=0.952, loss=53.241, backward_time=0.332, grad_norm=85.723, clip=100.000, loss_scale=1.000, optim_step_time=0.166, optim0_lr0=0.001, train_time=3.027 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:25:22,167 (trainer:732) INFO: 29epoch:train:7421-7950batch: iter_time=8.308e-04, forward_time=0.225, loss_att=54.472, acc=0.953, loss=54.472, backward_time=0.333, grad_norm=88.812, clip=100.000, loss_scale=1.000, optim_step_time=0.159, optim0_lr0=0.001, train_time=3.027 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:32:07,487 (trainer:732) INFO: 29epoch:train:7951-8480batch: iter_time=9.193e-04, forward_time=0.231, loss_att=53.750, acc=0.953, loss=53.750, backward_time=0.333, grad_norm=88.260, clip=100.000, loss_scale=1.000, optim_step_time=0.176, optim0_lr0=0.001, train_time=3.056 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:38:52,705 (trainer:732) INFO: 29epoch:train:8481-9010batch: iter_time=8.094e-04, forward_time=0.229, loss_att=54.358, acc=0.953, loss=54.358, backward_time=0.333, grad_norm=92.263, clip=100.000, loss_scale=1.000, optim_step_time=0.179, optim0_lr0=0.001, train_time=3.059 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:45:34,875 (trainer:732) INFO: 29epoch:train:9011-9540batch: iter_time=7.455e-04, forward_time=0.228, loss_att=53.429, acc=0.952, loss=53.429, backward_time=0.331, grad_norm=97.916, clip=100.000, loss_scale=1.000, optim_step_time=0.174, optim0_lr0=0.001, train_time=3.033 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:52:13,976 (trainer:732) INFO: 29epoch:train:9541-10070batch: iter_time=6.421e-04, forward_time=0.223, loss_att=54.086, acc=0.953, loss=54.086, backward_time=0.331, grad_norm=99.170, clip=100.000, loss_scale=1.000, optim_step_time=0.163, optim0_lr0=0.001, train_time=3.012 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 11:58:50,223 (trainer:732) INFO: 29epoch:train:10071-10600batch: iter_time=5.253e-04, forward_time=0.221, loss_att=53.900, acc=0.952, loss=53.900, backward_time=0.330, grad_norm=90.849, clip=100.000, loss_scale=1.000, optim_step_time=0.151, optim0_lr0=0.001, train_time=2.989 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:07:21,537 (trainer:338) INFO: 29epoch results: [train] iter_time=6.737e-04, forward_time=0.218, loss_att=53.550, acc=0.953, loss=53.550, backward_time=0.330, grad_norm=90.587, clip=100.000, loss_scale=1.000, optim_step_time=0.129, optim0_lr0=0.001, train_time=3.017, time=2 hours, 13 minutes and 36.41 seconds, total_count=307835, gpu_max_cached_mem_GB=30.221, [valid] loss_att=60.455, acc=0.947, cer=0.067, wer=0.194, loss=60.455, time=3 minutes and 54.08 seconds, total_count=348, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 18.39 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:07:29,065 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:07:29,076 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/19epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:07:29,077 (trainer:272) INFO: 30/60epoch started. Estimated time to finish: 3 days, 12 minutes and 42.24 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:17:21,621 (trainer:732) INFO: 30epoch:train:1-530batch: iter_time=0.003, forward_time=0.226, loss_att=51.534, acc=0.955, loss=51.534, backward_time=0.332, grad_norm=91.013, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=4.477 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:23:58,187 (trainer:732) INFO: 30epoch:train:531-1060batch: iter_time=6.127e-04, forward_time=0.225, loss_att=51.767, acc=0.953, loss=51.767, backward_time=0.332, grad_norm=83.950, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.992 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:30:35,180 (trainer:732) INFO: 30epoch:train:1061-1590batch: iter_time=5.074e-04, forward_time=0.224, loss_att=52.652, acc=0.954, loss=52.652, backward_time=0.333, grad_norm=88.203, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:37:18,145 (trainer:732) INFO: 30epoch:train:1591-2120batch: iter_time=6.466e-04, forward_time=0.228, loss_att=53.524, acc=0.954, loss=53.524, backward_time=0.335, grad_norm=96.116, clip=100.000, loss_scale=1.000, optim_step_time=0.156, optim0_lr0=0.001, train_time=3.039 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:43:53,969 (trainer:732) INFO: 30epoch:train:2121-2650batch: iter_time=4.523e-04, forward_time=0.223, loss_att=52.427, acc=0.954, loss=52.427, backward_time=0.331, grad_norm=89.480, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.988 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:50:29,360 (trainer:732) INFO: 30epoch:train:2651-3180batch: iter_time=4.765e-04, forward_time=0.221, loss_att=52.145, acc=0.954, loss=52.145, backward_time=0.331, grad_norm=89.899, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.983 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 12:57:02,953 (trainer:732) INFO: 30epoch:train:3181-3710batch: iter_time=5.152e-04, forward_time=0.221, loss_att=51.483, acc=0.954, loss=51.483, backward_time=0.330, grad_norm=92.175, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.971 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:03:39,521 (trainer:732) INFO: 30epoch:train:3711-4240batch: iter_time=5.836e-04, forward_time=0.221, loss_att=52.572, acc=0.954, loss=52.572, backward_time=0.332, grad_norm=89.905, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:10:13,014 (trainer:732) INFO: 30epoch:train:4241-4770batch: iter_time=4.889e-04, forward_time=0.220, loss_att=53.103, acc=0.954, loss=53.103, backward_time=0.331, grad_norm=90.271, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=0.001, train_time=2.971 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:16:46,492 (trainer:732) INFO: 30epoch:train:4771-5300batch: iter_time=4.859e-04, forward_time=0.220, loss_att=53.947, acc=0.952, loss=53.947, backward_time=0.330, grad_norm=87.754, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:23:20,599 (trainer:732) INFO: 30epoch:train:5301-5830batch: iter_time=5.414e-04, forward_time=0.222, loss_att=52.797, acc=0.953, loss=52.797, backward_time=0.331, grad_norm=95.461, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.975 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:29:56,180 (trainer:732) INFO: 30epoch:train:5831-6360batch: iter_time=4.926e-04, forward_time=0.222, loss_att=52.896, acc=0.954, loss=52.896, backward_time=0.332, grad_norm=101.175, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=0.001, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:36:31,632 (trainer:732) INFO: 30epoch:train:6361-6890batch: iter_time=5.180e-04, forward_time=0.222, loss_att=53.306, acc=0.953, loss=53.306, backward_time=0.332, grad_norm=90.123, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.986 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:43:06,923 (trainer:732) INFO: 30epoch:train:6891-7420batch: iter_time=5.254e-04, forward_time=0.222, loss_att=52.160, acc=0.954, loss=52.160, backward_time=0.331, grad_norm=92.026, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.981 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:49:41,118 (trainer:732) INFO: 30epoch:train:7421-7950batch: iter_time=4.745e-04, forward_time=0.221, loss_att=53.304, acc=0.953, loss=53.304, backward_time=0.330, grad_norm=88.342, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=0.001, train_time=2.976 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 13:56:14,256 (trainer:732) INFO: 30epoch:train:7951-8480batch: iter_time=5.819e-04, forward_time=0.220, loss_att=52.052, acc=0.953, loss=52.052, backward_time=0.329, grad_norm=83.984, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:02:48,040 (trainer:732) INFO: 30epoch:train:8481-9010batch: iter_time=6.237e-04, forward_time=0.221, loss_att=53.336, acc=0.953, loss=53.336, backward_time=0.330, grad_norm=88.939, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.972 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:09:23,359 (trainer:732) INFO: 30epoch:train:9011-9540batch: iter_time=5.527e-04, forward_time=0.222, loss_att=53.893, acc=0.953, loss=53.893, backward_time=0.331, grad_norm=95.391, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=0.001, train_time=2.982 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:15:56,768 (trainer:732) INFO: 30epoch:train:9541-10070batch: iter_time=4.671e-04, forward_time=0.220, loss_att=52.854, acc=0.953, loss=52.854, backward_time=0.330, grad_norm=92.027, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.971 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:22:34,274 (trainer:732) INFO: 30epoch:train:10071-10600batch: iter_time=4.768e-04, forward_time=0.222, loss_att=53.598, acc=0.954, loss=53.598, backward_time=0.334, grad_norm=86.296, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=2.997 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 145) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:31:13,955 (trainer:338) INFO: 30epoch results: [train] iter_time=6.488e-04, forward_time=0.222, loss_att=52.774, acc=0.954, loss=52.774, backward_time=0.331, grad_norm=90.612, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=0.001, train_time=3.058, time=2 hours, 15 minutes and 26.3 seconds, total_count=318450, gpu_max_cached_mem_GB=30.221, [valid] loss_att=62.308, acc=0.946, cer=0.068, wer=0.198, loss=62.308, time=3 minutes and 56.45 seconds, total_count=360, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 22.13 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:31:21,305 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:31:21,315 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/20epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:31:21,315 (trainer:272) INFO: 31/60epoch started. Estimated time to finish: 2 days, 21 hours and 58 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:41:14,853 (trainer:732) INFO: 31epoch:train:1-530batch: iter_time=0.005, forward_time=0.226, loss_att=50.783, acc=0.955, loss=50.783, backward_time=0.333, grad_norm=88.344, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=4.485 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:47:51,521 (trainer:732) INFO: 31epoch:train:531-1060batch: iter_time=6.248e-04, forward_time=0.224, loss_att=51.384, acc=0.955, loss=51.384, backward_time=0.333, grad_norm=94.609, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=0.001, train_time=2.992 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 14:54:26,862 (trainer:732) INFO: 31epoch:train:1061-1590batch: iter_time=5.166e-04, forward_time=0.223, loss_att=50.283, acc=0.955, loss=50.283, backward_time=0.332, grad_norm=86.585, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=0.001, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:01:00,784 (trainer:732) INFO: 31epoch:train:1591-2120batch: iter_time=5.453e-04, forward_time=0.220, loss_att=51.672, acc=0.954, loss=51.672, backward_time=0.330, grad_norm=92.035, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=9.997e-04, train_time=2.971 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:07:38,336 (trainer:732) INFO: 31epoch:train:2121-2650batch: iter_time=5.670e-04, forward_time=0.224, loss_att=51.865, acc=0.955, loss=51.865, backward_time=0.331, grad_norm=91.236, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.988e-04, train_time=3.001 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:14:17,355 (trainer:732) INFO: 31epoch:train:2651-3180batch: iter_time=5.078e-04, forward_time=0.224, loss_att=53.117, acc=0.954, loss=53.117, backward_time=0.333, grad_norm=91.125, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.980e-04, train_time=3.009 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:20:54,231 (trainer:732) INFO: 31epoch:train:3181-3710batch: iter_time=5.522e-04, forward_time=0.223, loss_att=52.650, acc=0.955, loss=52.650, backward_time=0.332, grad_norm=93.060, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.972e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:27:31,361 (trainer:732) INFO: 31epoch:train:3711-4240batch: iter_time=5.183e-04, forward_time=0.223, loss_att=51.927, acc=0.954, loss=51.927, backward_time=0.331, grad_norm=84.562, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.964e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:34:06,797 (trainer:732) INFO: 31epoch:train:4241-4770batch: iter_time=5.475e-04, forward_time=0.222, loss_att=51.753, acc=0.954, loss=51.753, backward_time=0.331, grad_norm=87.486, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.955e-04, train_time=2.986 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:40:40,690 (trainer:732) INFO: 31epoch:train:4771-5300batch: iter_time=4.950e-04, forward_time=0.220, loss_att=51.491, acc=0.954, loss=51.491, backward_time=0.329, grad_norm=86.226, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.947e-04, train_time=2.970 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:47:19,017 (trainer:732) INFO: 31epoch:train:5301-5830batch: iter_time=6.024e-04, forward_time=0.224, loss_att=52.225, acc=0.954, loss=52.225, backward_time=0.332, grad_norm=94.324, clip=100.000, loss_scale=1.000, optim_step_time=0.152, optim0_lr0=9.939e-04, train_time=3.007 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 15:54:00,361 (trainer:732) INFO: 31epoch:train:5831-6360batch: iter_time=6.237e-04, forward_time=0.229, loss_att=52.463, acc=0.954, loss=52.463, backward_time=0.333, grad_norm=89.412, clip=100.000, loss_scale=1.000, optim_step_time=0.155, optim0_lr0=9.931e-04, train_time=3.026 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:00:34,204 (trainer:732) INFO: 31epoch:train:6361-6890batch: iter_time=5.172e-04, forward_time=0.220, loss_att=51.379, acc=0.954, loss=51.379, backward_time=0.330, grad_norm=95.620, clip=100.000, loss_scale=1.000, optim_step_time=0.136, optim0_lr0=9.923e-04, train_time=2.973 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:07:06,651 (trainer:732) INFO: 31epoch:train:6891-7420batch: iter_time=5.165e-04, forward_time=0.220, loss_att=52.215, acc=0.953, loss=52.215, backward_time=0.328, grad_norm=87.036, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.915e-04, train_time=2.960 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:13:41,414 (trainer:732) INFO: 31epoch:train:7421-7950batch: iter_time=4.642e-04, forward_time=0.222, loss_att=51.379, acc=0.954, loss=51.379, backward_time=0.330, grad_norm=89.695, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.907e-04, train_time=2.980 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:20:18,470 (trainer:732) INFO: 31epoch:train:7951-8480batch: iter_time=4.786e-04, forward_time=0.222, loss_att=51.976, acc=0.955, loss=51.976, backward_time=0.332, grad_norm=90.334, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.899e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:26:52,792 (trainer:732) INFO: 31epoch:train:8481-9010batch: iter_time=5.419e-04, forward_time=0.221, loss_att=52.284, acc=0.953, loss=52.284, backward_time=0.330, grad_norm=86.920, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.891e-04, train_time=2.977 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.245<54027> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<17236> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:33:27,387 (trainer:732) INFO: 31epoch:train:9011-9540batch: iter_time=4.374e-04, forward_time=0.221, loss_att=52.741, acc=0.954, loss=52.741, backward_time=0.331, grad_norm=93.575, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.883e-04, train_time=2.976 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:40:00,684 (trainer:732) INFO: 31epoch:train:9541-10070batch: iter_time=4.286e-04, forward_time=0.219, loss_att=52.042, acc=0.954, loss=52.042, backward_time=0.330, grad_norm=88.349, clip=100.000, loss_scale=1.000, optim_step_time=0.133, optim0_lr0=9.875e-04, train_time=2.970 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:46:34,070 (trainer:732) INFO: 31epoch:train:10071-10600batch: iter_time=5.372e-04, forward_time=0.219, loss_att=53.105, acc=0.954, loss=53.105, backward_time=0.330, grad_norm=88.875, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=9.867e-04, train_time=2.966 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:55:19,134 (trainer:338) INFO: 31epoch results: [train] iter_time=7.453e-04, forward_time=0.222, loss_att=51.935, acc=0.954, loss=51.935, backward_time=0.331, grad_norm=89.969, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.943e-04, train_time=3.061, time=2 hours, 15 minutes and 33.68 seconds, total_count=329065, gpu_max_cached_mem_GB=30.221, [valid] loss_att=61.234, acc=0.947, cer=0.066, wer=0.194, loss=61.234, time=4 minutes and 1.17 seconds, total_count=372, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 22.96 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:55:26,320 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:55:26,330 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/21epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 16:55:26,331 (trainer:272) INFO: 32/60epoch started. Estimated time to finish: 2 days, 19 hours and 43 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:05:13,188 (trainer:732) INFO: 32epoch:train:1-530batch: iter_time=0.003, forward_time=0.224, loss_att=50.481, acc=0.956, loss=50.481, backward_time=0.331, grad_norm=92.650, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=9.859e-04, train_time=4.435 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:11:45,673 (trainer:732) INFO: 32epoch:train:531-1060batch: iter_time=4.886e-04, forward_time=0.220, loss_att=50.343, acc=0.955, loss=50.343, backward_time=0.328, grad_norm=88.617, clip=100.000, loss_scale=1.000, optim_step_time=0.132, optim0_lr0=9.851e-04, train_time=2.960 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:18:19,624 (trainer:732) INFO: 32epoch:train:1061-1590batch: iter_time=5.463e-04, forward_time=0.221, loss_att=49.705, acc=0.956, loss=49.705, backward_time=0.330, grad_norm=90.816, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=9.843e-04, train_time=2.973 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:24:56,793 (trainer:732) INFO: 32epoch:train:1591-2120batch: iter_time=4.820e-04, forward_time=0.223, loss_att=50.743, acc=0.956, loss=50.743, backward_time=0.333, grad_norm=88.432, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.835e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:31:34,072 (trainer:732) INFO: 32epoch:train:2121-2650batch: iter_time=5.002e-04, forward_time=0.223, loss_att=50.192, acc=0.956, loss=50.192, backward_time=0.332, grad_norm=87.361, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.827e-04, train_time=3.000 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:38:09,009 (trainer:732) INFO: 32epoch:train:2651-3180batch: iter_time=5.461e-04, forward_time=0.222, loss_att=52.124, acc=0.954, loss=52.124, backward_time=0.330, grad_norm=89.124, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.819e-04, train_time=2.978 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:44:41,780 (trainer:732) INFO: 32epoch:train:3181-3710batch: iter_time=4.305e-04, forward_time=0.220, loss_att=50.988, acc=0.954, loss=50.988, backward_time=0.329, grad_norm=87.549, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.811e-04, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:51:19,160 (trainer:732) INFO: 32epoch:train:3711-4240batch: iter_time=5.597e-04, forward_time=0.223, loss_att=50.951, acc=0.955, loss=50.951, backward_time=0.333, grad_norm=99.021, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.804e-04, train_time=2.997 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 17:57:55,435 (trainer:732) INFO: 32epoch:train:4241-4770batch: iter_time=5.940e-04, forward_time=0.223, loss_att=51.632, acc=0.955, loss=51.632, backward_time=0.331, grad_norm=88.272, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.796e-04, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:04:31,468 (trainer:732) INFO: 32epoch:train:4771-5300batch: iter_time=5.530e-04, forward_time=0.222, loss_att=51.134, acc=0.955, loss=51.134, backward_time=0.330, grad_norm=95.225, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.788e-04, train_time=2.987 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:11:07,718 (trainer:732) INFO: 32epoch:train:5301-5830batch: iter_time=4.736e-04, forward_time=0.222, loss_att=50.550, acc=0.955, loss=50.550, backward_time=0.330, grad_norm=85.216, clip=100.000, loss_scale=1.000, optim_step_time=0.149, optim0_lr0=9.780e-04, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:17:44,877 (trainer:732) INFO: 32epoch:train:5831-6360batch: iter_time=4.052e-04, forward_time=0.223, loss_att=52.401, acc=0.954, loss=52.401, backward_time=0.332, grad_norm=85.313, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.773e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:24:21,200 (trainer:732) INFO: 32epoch:train:6361-6890batch: iter_time=5.631e-04, forward_time=0.222, loss_att=51.269, acc=0.955, loss=51.269, backward_time=0.331, grad_norm=90.058, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.765e-04, train_time=2.992 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:30:57,532 (trainer:732) INFO: 32epoch:train:6891-7420batch: iter_time=4.948e-04, forward_time=0.222, loss_att=50.899, acc=0.954, loss=50.899, backward_time=0.331, grad_norm=89.221, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.757e-04, train_time=2.989 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:37:34,943 (trainer:732) INFO: 32epoch:train:7421-7950batch: iter_time=5.041e-04, forward_time=0.223, loss_att=51.932, acc=0.955, loss=51.932, backward_time=0.331, grad_norm=89.306, clip=100.000, loss_scale=1.000, optim_step_time=0.146, optim0_lr0=9.749e-04, train_time=3.000 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:44:11,103 (trainer:732) INFO: 32epoch:train:7951-8480batch: iter_time=5.166e-04, forward_time=0.222, loss_att=51.496, acc=0.955, loss=51.496, backward_time=0.331, grad_norm=94.050, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.742e-04, train_time=2.988 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:50:46,865 (trainer:732) INFO: 32epoch:train:8481-9010batch: iter_time=4.424e-04, forward_time=0.222, loss_att=51.345, acc=0.955, loss=51.345, backward_time=0.330, grad_norm=88.620, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.734e-04, train_time=2.988 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 18:57:26,060 (trainer:732) INFO: 32epoch:train:9011-9540batch: iter_time=4.960e-04, forward_time=0.223, loss_att=51.364, acc=0.956, loss=51.364, backward_time=0.333, grad_norm=92.255, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.726e-04, train_time=3.010 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:04:04,088 (trainer:732) INFO: 32epoch:train:9541-10070batch: iter_time=5.572e-04, forward_time=0.224, loss_att=51.874, acc=0.955, loss=51.874, backward_time=0.332, grad_norm=87.268, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.719e-04, train_time=3.006 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:10:39,901 (trainer:732) INFO: 32epoch:train:10071-10600batch: iter_time=4.144e-04, forward_time=0.222, loss_att=51.020, acc=0.955, loss=51.020, backward_time=0.331, grad_norm=90.074, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.711e-04, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:19:25,462 (trainer:338) INFO: 32epoch results: [train] iter_time=6.214e-04, forward_time=0.222, loss_att=51.117, acc=0.955, loss=51.117, backward_time=0.331, grad_norm=89.918, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.784e-04, train_time=3.061, time=2 hours, 15 minutes and 33.62 seconds, total_count=339680, gpu_max_cached_mem_GB=30.221, [valid] loss_att=61.054, acc=0.947, cer=0.068, wer=0.194, loss=61.054, time=3 minutes and 58.2 seconds, total_count=384, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 27.31 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:19:32,847 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:19:32,862 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/22epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:19:32,862 (trainer:272) INFO: 33/60epoch started. Estimated time to finish: 2 days, 17 hours and 28 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:29:29,956 (trainer:732) INFO: 33epoch:train:1-530batch: iter_time=0.003, forward_time=0.230, loss_att=49.856, acc=0.957, loss=49.856, backward_time=0.334, grad_norm=95.084, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.704e-04, train_time=4.512 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:36:07,001 (trainer:732) INFO: 33epoch:train:531-1060batch: iter_time=5.111e-04, forward_time=0.224, loss_att=50.000, acc=0.956, loss=50.000, backward_time=0.331, grad_norm=86.252, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.696e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:42:44,148 (trainer:732) INFO: 33epoch:train:1061-1590batch: iter_time=5.422e-04, forward_time=0.225, loss_att=49.810, acc=0.956, loss=49.810, backward_time=0.332, grad_norm=83.885, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.688e-04, train_time=2.998 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:49:23,636 (trainer:732) INFO: 33epoch:train:1591-2120batch: iter_time=5.032e-04, forward_time=0.226, loss_att=49.461, acc=0.957, loss=49.461, backward_time=0.334, grad_norm=96.761, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.681e-04, train_time=3.012 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 19:56:00,184 (trainer:732) INFO: 33epoch:train:2121-2650batch: iter_time=5.269e-04, forward_time=0.224, loss_att=49.654, acc=0.956, loss=49.654, backward_time=0.331, grad_norm=94.917, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.673e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:02:37,344 (trainer:732) INFO: 33epoch:train:2651-3180batch: iter_time=5.373e-04, forward_time=0.224, loss_att=49.093, acc=0.956, loss=49.093, backward_time=0.331, grad_norm=91.987, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.666e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:09:13,299 (trainer:732) INFO: 33epoch:train:3181-3710batch: iter_time=5.045e-04, forward_time=0.223, loss_att=49.341, acc=0.956, loss=49.341, backward_time=0.332, grad_norm=95.646, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.658e-04, train_time=2.989 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:15:49,768 (trainer:732) INFO: 33epoch:train:3711-4240batch: iter_time=4.874e-04, forward_time=0.222, loss_att=51.749, acc=0.955, loss=51.749, backward_time=0.330, grad_norm=94.277, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.651e-04, train_time=2.990 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:22:26,614 (trainer:732) INFO: 33epoch:train:4241-4770batch: iter_time=4.777e-04, forward_time=0.223, loss_att=51.294, acc=0.956, loss=51.294, backward_time=0.331, grad_norm=96.960, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.644e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:29:01,860 (trainer:732) INFO: 33epoch:train:4771-5300batch: iter_time=4.264e-04, forward_time=0.222, loss_att=50.390, acc=0.956, loss=50.390, backward_time=0.331, grad_norm=93.954, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.636e-04, train_time=2.981 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:35:36,951 (trainer:732) INFO: 33epoch:train:5301-5830batch: iter_time=5.317e-04, forward_time=0.222, loss_att=49.990, acc=0.956, loss=49.990, backward_time=0.330, grad_norm=89.618, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=9.629e-04, train_time=2.983 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:42:14,159 (trainer:732) INFO: 33epoch:train:5831-6360batch: iter_time=5.393e-04, forward_time=0.223, loss_att=50.931, acc=0.955, loss=50.931, backward_time=0.331, grad_norm=90.088, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.621e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:48:48,023 (trainer:732) INFO: 33epoch:train:6361-6890batch: iter_time=5.296e-04, forward_time=0.220, loss_att=51.445, acc=0.954, loss=51.445, backward_time=0.329, grad_norm=85.722, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.614e-04, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 20:55:23,485 (trainer:732) INFO: 33epoch:train:6891-7420batch: iter_time=5.123e-04, forward_time=0.221, loss_att=51.295, acc=0.955, loss=51.295, backward_time=0.331, grad_norm=84.908, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.607e-04, train_time=2.982 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:01:56,770 (trainer:732) INFO: 33epoch:train:7421-7950batch: iter_time=4.840e-04, forward_time=0.220, loss_att=50.733, acc=0.954, loss=50.733, backward_time=0.329, grad_norm=90.839, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.599e-04, train_time=2.970 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:08:31,230 (trainer:732) INFO: 33epoch:train:7951-8480batch: iter_time=4.383e-04, forward_time=0.220, loss_att=50.392, acc=0.955, loss=50.392, backward_time=0.329, grad_norm=86.532, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.592e-04, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:15:07,070 (trainer:732) INFO: 33epoch:train:8481-9010batch: iter_time=5.277e-04, forward_time=0.222, loss_att=51.166, acc=0.955, loss=51.166, backward_time=0.330, grad_norm=89.209, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.585e-04, train_time=2.988 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:21:42,673 (trainer:732) INFO: 33epoch:train:9011-9540batch: iter_time=4.960e-04, forward_time=0.221, loss_att=50.854, acc=0.955, loss=50.854, backward_time=0.331, grad_norm=85.319, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.577e-04, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:28:16,900 (trainer:732) INFO: 33epoch:train:9541-10070batch: iter_time=5.070e-04, forward_time=0.221, loss_att=51.720, acc=0.955, loss=51.720, backward_time=0.331, grad_norm=94.035, clip=100.000, loss_scale=1.000, optim_step_time=0.134, optim0_lr0=9.570e-04, train_time=2.975 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:34:51,092 (trainer:732) INFO: 33epoch:train:10071-10600batch: iter_time=4.982e-04, forward_time=0.220, loss_att=49.523, acc=0.955, loss=49.523, backward_time=0.329, grad_norm=84.249, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.563e-04, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:43:35,246 (trainer:338) INFO: 33epoch results: [train] iter_time=6.499e-04, forward_time=0.223, loss_att=50.431, acc=0.956, loss=50.431, backward_time=0.331, grad_norm=90.540, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.633e-04, train_time=3.063, time=2 hours, 15 minutes and 37.43 seconds, total_count=350295, gpu_max_cached_mem_GB=30.221, [valid] loss_att=60.518, acc=0.947, cer=0.065, wer=0.193, loss=60.518, time=3 minutes and 53.99 seconds, total_count=396, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 30.97 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:43:42,753 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:43:42,764 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/23epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:43:42,764 (trainer:272) INFO: 34/60epoch started. Estimated time to finish: 2 days, 15 hours and 12 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 21:53:39,054 (trainer:732) INFO: 34epoch:train:1-530batch: iter_time=0.004, forward_time=0.226, loss_att=49.212, acc=0.957, loss=49.212, backward_time=0.333, grad_norm=94.021, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=9.555e-04, train_time=4.507 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:00:17,584 (trainer:732) INFO: 34epoch:train:531-1060batch: iter_time=5.890e-04, forward_time=0.226, loss_att=49.630, acc=0.956, loss=49.630, backward_time=0.331, grad_norm=98.006, clip=100.000, loss_scale=1.000, optim_step_time=0.148, optim0_lr0=9.548e-04, train_time=3.005 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:06:56,418 (trainer:732) INFO: 34epoch:train:1061-1590batch: iter_time=5.319e-04, forward_time=0.226, loss_att=49.210, acc=0.957, loss=49.210, backward_time=0.332, grad_norm=91.817, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.541e-04, train_time=3.009 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:13:36,081 (trainer:732) INFO: 34epoch:train:1591-2120batch: iter_time=5.327e-04, forward_time=0.226, loss_att=50.381, acc=0.957, loss=50.381, backward_time=0.332, grad_norm=88.144, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.534e-04, train_time=3.015 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:20:16,073 (trainer:732) INFO: 34epoch:train:2121-2650batch: iter_time=5.668e-04, forward_time=0.227, loss_att=50.548, acc=0.956, loss=50.548, backward_time=0.333, grad_norm=89.788, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.527e-04, train_time=3.018 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:26:53,384 (trainer:732) INFO: 34epoch:train:2651-3180batch: iter_time=5.389e-04, forward_time=0.225, loss_att=49.659, acc=0.956, loss=49.659, backward_time=0.330, grad_norm=94.597, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.520e-04, train_time=2.998 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:33:31,085 (trainer:732) INFO: 34epoch:train:3181-3710batch: iter_time=4.446e-04, forward_time=0.225, loss_att=51.051, acc=0.955, loss=51.051, backward_time=0.331, grad_norm=90.520, clip=100.000, loss_scale=1.000, optim_step_time=0.147, optim0_lr0=9.512e-04, train_time=3.003 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:40:07,999 (trainer:732) INFO: 34epoch:train:3711-4240batch: iter_time=5.356e-04, forward_time=0.223, loss_att=50.050, acc=0.956, loss=50.050, backward_time=0.331, grad_norm=99.361, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.505e-04, train_time=2.993 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:46:44,109 (trainer:732) INFO: 34epoch:train:4241-4770batch: iter_time=5.094e-04, forward_time=0.223, loss_att=48.973, acc=0.956, loss=48.973, backward_time=0.331, grad_norm=89.993, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.498e-04, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:53:22,752 (trainer:732) INFO: 34epoch:train:4771-5300batch: iter_time=5.984e-04, forward_time=0.225, loss_att=49.824, acc=0.957, loss=49.824, backward_time=0.333, grad_norm=88.981, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.491e-04, train_time=3.006 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 22:59:59,982 (trainer:732) INFO: 34epoch:train:5301-5830batch: iter_time=5.856e-04, forward_time=0.223, loss_att=49.762, acc=0.956, loss=49.762, backward_time=0.331, grad_norm=93.038, clip=100.000, loss_scale=1.000, optim_step_time=0.148, optim0_lr0=9.484e-04, train_time=2.998 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:06:38,088 (trainer:732) INFO: 34epoch:train:5831-6360batch: iter_time=4.319e-04, forward_time=0.224, loss_att=48.600, acc=0.956, loss=48.600, backward_time=0.331, grad_norm=92.080, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.477e-04, train_time=3.003 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:13:14,161 (trainer:732) INFO: 34epoch:train:6361-6890batch: iter_time=4.563e-04, forward_time=0.223, loss_att=48.872, acc=0.956, loss=48.872, backward_time=0.330, grad_norm=90.217, clip=100.000, loss_scale=1.000, optim_step_time=0.149, optim0_lr0=9.470e-04, train_time=2.990 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:19:54,010 (trainer:732) INFO: 34epoch:train:6891-7420batch: iter_time=5.069e-04, forward_time=0.226, loss_att=49.862, acc=0.956, loss=49.862, backward_time=0.333, grad_norm=91.637, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.463e-04, train_time=3.016 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:26:30,910 (trainer:732) INFO: 34epoch:train:7421-7950batch: iter_time=5.586e-04, forward_time=0.223, loss_att=50.031, acc=0.956, loss=50.031, backward_time=0.332, grad_norm=91.067, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.456e-04, train_time=2.997 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:33:08,045 (trainer:732) INFO: 34epoch:train:7951-8480batch: iter_time=4.609e-04, forward_time=0.223, loss_att=49.873, acc=0.956, loss=49.873, backward_time=0.331, grad_norm=90.041, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.449e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:39:42,786 (trainer:732) INFO: 34epoch:train:8481-9010batch: iter_time=4.083e-04, forward_time=0.221, loss_att=49.995, acc=0.955, loss=49.995, backward_time=0.330, grad_norm=97.860, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.442e-04, train_time=2.980 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:46:20,022 (trainer:732) INFO: 34epoch:train:9011-9540batch: iter_time=4.415e-04, forward_time=0.224, loss_att=50.179, acc=0.956, loss=50.179, backward_time=0.331, grad_norm=87.336, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.435e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:52:57,076 (trainer:732) INFO: 34epoch:train:9541-10070batch: iter_time=5.324e-04, forward_time=0.223, loss_att=50.208, acc=0.956, loss=50.208, backward_time=0.331, grad_norm=91.570, clip=100.000, loss_scale=1.000, optim_step_time=0.146, optim0_lr0=9.428e-04, train_time=2.998 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-23 23:59:34,109 (trainer:732) INFO: 34epoch:train:10071-10600batch: iter_time=5.689e-04, forward_time=0.224, loss_att=49.823, acc=0.956, loss=49.823, backward_time=0.331, grad_norm=85.934, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.421e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:08:38,732 (trainer:338) INFO: 34epoch results: [train] iter_time=6.833e-04, forward_time=0.224, loss_att=49.787, acc=0.956, loss=49.787, backward_time=0.331, grad_norm=91.776, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.488e-04, train_time=3.075, time=2 hours, 16 minutes and 13.67 seconds, total_count=360910, gpu_max_cached_mem_GB=30.221, [valid] loss_att=60.864, acc=0.947, cer=0.066, wer=0.193, loss=60.864, time=4 minutes and 9.76 seconds, total_count=408, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 32.54 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:08:46,146 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:08:46,159 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/24epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:08:46,160 (trainer:272) INFO: 35/60epoch started. Estimated time to finish: 2 days, 12 hours and 56 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:18:49,614 (trainer:732) INFO: 35epoch:train:1-530batch: iter_time=0.002, forward_time=0.231, loss_att=47.984, acc=0.958, loss=47.984, backward_time=0.333, grad_norm=91.783, clip=100.000, loss_scale=1.000, optim_step_time=0.148, optim0_lr0=9.414e-04, train_time=4.561 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:25:31,460 (trainer:732) INFO: 35epoch:train:531-1060batch: iter_time=5.447e-04, forward_time=0.229, loss_att=48.851, acc=0.957, loss=48.851, backward_time=0.333, grad_norm=86.764, clip=100.000, loss_scale=1.000, optim_step_time=0.149, optim0_lr0=9.407e-04, train_time=3.030 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:32:10,113 (trainer:732) INFO: 35epoch:train:1061-1590batch: iter_time=5.192e-04, forward_time=0.226, loss_att=49.735, acc=0.957, loss=49.735, backward_time=0.332, grad_norm=87.551, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.400e-04, train_time=3.009 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:38:47,442 (trainer:732) INFO: 35epoch:train:1591-2120batch: iter_time=5.505e-04, forward_time=0.223, loss_att=48.333, acc=0.957, loss=48.333, backward_time=0.329, grad_norm=87.236, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.393e-04, train_time=2.997 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:45:24,326 (trainer:732) INFO: 35epoch:train:2121-2650batch: iter_time=6.508e-04, forward_time=0.225, loss_att=49.232, acc=0.957, loss=49.232, backward_time=0.330, grad_norm=87.788, clip=100.000, loss_scale=1.000, optim_step_time=0.147, optim0_lr0=9.386e-04, train_time=2.997 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:51:58,694 (trainer:732) INFO: 35epoch:train:2651-3180batch: iter_time=4.526e-04, forward_time=0.222, loss_att=48.265, acc=0.957, loss=48.265, backward_time=0.329, grad_norm=85.849, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.380e-04, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 00:58:34,803 (trainer:732) INFO: 35epoch:train:3181-3710batch: iter_time=4.837e-04, forward_time=0.222, loss_att=49.765, acc=0.957, loss=49.765, backward_time=0.331, grad_norm=91.966, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.373e-04, train_time=2.990 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:05:10,069 (trainer:732) INFO: 35epoch:train:3711-4240batch: iter_time=6.016e-04, forward_time=0.221, loss_att=48.623, acc=0.956, loss=48.623, backward_time=0.328, grad_norm=85.007, clip=100.000, loss_scale=1.000, optim_step_time=0.148, optim0_lr0=9.366e-04, train_time=2.982 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:11:46,681 (trainer:732) INFO: 35epoch:train:4241-4770batch: iter_time=4.415e-04, forward_time=0.223, loss_att=49.723, acc=0.957, loss=49.723, backward_time=0.331, grad_norm=93.917, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.359e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:18:20,902 (trainer:732) INFO: 35epoch:train:4771-5300batch: iter_time=4.668e-04, forward_time=0.222, loss_att=49.522, acc=0.956, loss=49.522, backward_time=0.329, grad_norm=94.824, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.352e-04, train_time=2.973 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:24:56,874 (trainer:732) INFO: 35epoch:train:5301-5830batch: iter_time=4.609e-04, forward_time=0.223, loss_att=48.855, acc=0.957, loss=48.855, backward_time=0.331, grad_norm=88.201, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.346e-04, train_time=2.989 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:31:34,045 (trainer:732) INFO: 35epoch:train:5831-6360batch: iter_time=7.090e-04, forward_time=0.222, loss_att=48.584, acc=0.957, loss=48.584, backward_time=0.332, grad_norm=88.589, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.339e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:38:11,729 (trainer:732) INFO: 35epoch:train:6361-6890batch: iter_time=4.722e-04, forward_time=0.223, loss_att=50.507, acc=0.956, loss=50.507, backward_time=0.332, grad_norm=94.016, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.332e-04, train_time=3.002 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:44:47,438 (trainer:732) INFO: 35epoch:train:6891-7420batch: iter_time=4.540e-04, forward_time=0.221, loss_att=49.769, acc=0.956, loss=49.769, backward_time=0.331, grad_norm=93.270, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.325e-04, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:51:21,594 (trainer:732) INFO: 35epoch:train:7421-7950batch: iter_time=4.732e-04, forward_time=0.221, loss_att=49.671, acc=0.956, loss=49.671, backward_time=0.330, grad_norm=88.117, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.319e-04, train_time=2.975 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 01:57:54,589 (trainer:732) INFO: 35epoch:train:7951-8480batch: iter_time=5.363e-04, forward_time=0.220, loss_att=48.661, acc=0.957, loss=48.661, backward_time=0.329, grad_norm=86.236, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.312e-04, train_time=2.965 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:04:29,203 (trainer:732) INFO: 35epoch:train:8481-9010batch: iter_time=5.109e-04, forward_time=0.222, loss_att=49.707, acc=0.956, loss=49.707, backward_time=0.330, grad_norm=86.048, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.305e-04, train_time=2.979 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:11:07,385 (trainer:732) INFO: 35epoch:train:9011-9540batch: iter_time=4.522e-04, forward_time=0.224, loss_att=49.606, acc=0.957, loss=49.606, backward_time=0.333, grad_norm=94.203, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=9.299e-04, train_time=3.003 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:17:42,898 (trainer:732) INFO: 35epoch:train:9541-10070batch: iter_time=4.879e-04, forward_time=0.222, loss_att=49.039, acc=0.956, loss=49.039, backward_time=0.331, grad_norm=91.062, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.292e-04, train_time=2.985 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:24:19,662 (trainer:732) INFO: 35epoch:train:10071-10600batch: iter_time=4.815e-04, forward_time=0.222, loss_att=48.654, acc=0.958, loss=48.654, backward_time=0.333, grad_norm=88.694, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.285e-04, train_time=2.993 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:33:28,236 (trainer:338) INFO: 35epoch results: [train] iter_time=5.814e-04, forward_time=0.223, loss_att=49.158, acc=0.957, loss=49.158, backward_time=0.331, grad_norm=89.545, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.349e-04, train_time=3.068, time=2 hours, 15 minutes and 54.61 seconds, total_count=371525, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.870, acc=0.949, cer=0.063, wer=0.188, loss=58.870, time=4 minutes and 12.77 seconds, total_count=420, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 34.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:33:35,367 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:33:35,396 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/25epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:33:35,397 (trainer:272) INFO: 36/60epoch started. Estimated time to finish: 2 days, 10 hours and 39 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:43:32,011 (trainer:732) INFO: 36epoch:train:1-530batch: iter_time=0.003, forward_time=0.227, loss_att=46.949, acc=0.958, loss=46.949, backward_time=0.332, grad_norm=86.745, clip=100.000, loss_scale=1.000, optim_step_time=0.148, optim0_lr0=9.279e-04, train_time=4.509 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:50:09,693 (trainer:732) INFO: 36epoch:train:531-1060batch: iter_time=5.499e-04, forward_time=0.227, loss_att=47.491, acc=0.958, loss=47.491, backward_time=0.331, grad_norm=86.187, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.272e-04, train_time=2.999 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 02:56:46,650 (trainer:732) INFO: 36epoch:train:1061-1590batch: iter_time=7.145e-04, forward_time=0.225, loss_att=48.681, acc=0.957, loss=48.681, backward_time=0.331, grad_norm=93.668, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.265e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:03:23,001 (trainer:732) INFO: 36epoch:train:1591-2120batch: iter_time=4.301e-04, forward_time=0.224, loss_att=48.410, acc=0.957, loss=48.410, backward_time=0.331, grad_norm=89.863, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.259e-04, train_time=2.990 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:09:59,135 (trainer:732) INFO: 36epoch:train:2121-2650batch: iter_time=5.264e-04, forward_time=0.223, loss_att=46.923, acc=0.958, loss=46.923, backward_time=0.330, grad_norm=88.332, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.252e-04, train_time=2.991 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:16:34,699 (trainer:732) INFO: 36epoch:train:2651-3180batch: iter_time=5.169e-04, forward_time=0.222, loss_att=47.532, acc=0.957, loss=47.532, backward_time=0.330, grad_norm=84.174, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.246e-04, train_time=2.982 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:23:11,665 (trainer:732) INFO: 36epoch:train:3181-3710batch: iter_time=5.115e-04, forward_time=0.223, loss_att=49.213, acc=0.957, loss=49.213, backward_time=0.331, grad_norm=93.294, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.239e-04, train_time=2.997 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<51907> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<51985> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<47370> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 141) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<34754> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<64614> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<64611> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 142) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<49490> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<49646> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<16623> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<16807> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<62195> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<51375> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<30378> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<30390> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<19048> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<60862> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:29:49,868 (trainer:732) INFO: 36epoch:train:3711-4240batch: iter_time=5.215e-04, forward_time=0.224, loss_att=48.408, acc=0.957, loss=48.408, backward_time=0.332, grad_norm=85.180, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.233e-04, train_time=3.003 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<34053> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<34437> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<31080> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<17898> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<47768> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<47756> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 153) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<36743> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<43128> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 154) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<62896> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<62992> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<58153> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 143) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<44699> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<20362> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<20376> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<54777> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<52916> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 144) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 146) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:36:24,628 (trainer:732) INFO: 36epoch:train:4241-4770batch: iter_time=5.347e-04, forward_time=0.223, loss_att=49.009, acc=0.956, loss=49.009, backward_time=0.330, grad_norm=91.423, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.226e-04, train_time=2.980 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:43:02,539 (trainer:732) INFO: 36epoch:train:4771-5300batch: iter_time=4.927e-04, forward_time=0.223, loss_att=47.953, acc=0.957, loss=47.953, backward_time=0.331, grad_norm=93.823, clip=100.000, loss_scale=1.000, optim_step_time=0.147, optim0_lr0=9.220e-04, train_time=3.001 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:49:38,692 (trainer:732) INFO: 36epoch:train:5301-5830batch: iter_time=4.731e-04, forward_time=0.222, loss_att=48.255, acc=0.958, loss=48.255, backward_time=0.332, grad_norm=92.362, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.213e-04, train_time=2.990 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 03:56:15,643 (trainer:732) INFO: 36epoch:train:5831-6360batch: iter_time=5.377e-04, forward_time=0.223, loss_att=49.211, acc=0.957, loss=49.211, backward_time=0.331, grad_norm=89.386, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.207e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:02:52,364 (trainer:732) INFO: 36epoch:train:6361-6890batch: iter_time=5.151e-04, forward_time=0.223, loss_att=49.064, acc=0.957, loss=49.064, backward_time=0.331, grad_norm=87.849, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.200e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:09:28,450 (trainer:732) INFO: 36epoch:train:6891-7420batch: iter_time=5.214e-04, forward_time=0.223, loss_att=47.944, acc=0.957, loss=47.944, backward_time=0.330, grad_norm=85.779, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.194e-04, train_time=2.987 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:16:08,463 (trainer:732) INFO: 36epoch:train:7421-7950batch: iter_time=5.573e-04, forward_time=0.224, loss_att=49.064, acc=0.957, loss=49.064, backward_time=0.334, grad_norm=88.487, clip=100.000, loss_scale=1.000, optim_step_time=0.149, optim0_lr0=9.187e-04, train_time=3.020 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:22:46,619 (trainer:732) INFO: 36epoch:train:7951-8480batch: iter_time=4.429e-04, forward_time=0.225, loss_att=50.679, acc=0.956, loss=50.679, backward_time=0.333, grad_norm=91.584, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.181e-04, train_time=3.002 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:29:24,706 (trainer:732) INFO: 36epoch:train:8481-9010batch: iter_time=5.127e-04, forward_time=0.225, loss_att=48.149, acc=0.958, loss=48.149, backward_time=0.332, grad_norm=91.150, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.175e-04, train_time=3.005 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:36:02,944 (trainer:732) INFO: 36epoch:train:9011-9540batch: iter_time=5.672e-04, forward_time=0.224, loss_att=49.132, acc=0.957, loss=49.132, backward_time=0.332, grad_norm=85.875, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.168e-04, train_time=3.004 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:42:38,136 (trainer:732) INFO: 36epoch:train:9541-10070batch: iter_time=6.744e-04, forward_time=0.222, loss_att=49.666, acc=0.956, loss=49.666, backward_time=0.329, grad_norm=87.883, clip=100.000, loss_scale=1.000, optim_step_time=0.150, optim0_lr0=9.162e-04, train_time=2.984 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:49:16,124 (trainer:732) INFO: 36epoch:train:10071-10600batch: iter_time=5.670e-04, forward_time=0.224, loss_att=49.056, acc=0.957, loss=49.056, backward_time=0.333, grad_norm=92.555, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.155e-04, train_time=3.001 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:58:18,671 (trainer:338) INFO: 36epoch results: [train] iter_time=6.406e-04, forward_time=0.224, loss_att=48.543, acc=0.957, loss=48.543, backward_time=0.331, grad_norm=89.276, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.217e-04, train_time=3.071, time=2 hours, 16 minutes and 1.68 seconds, total_count=382140, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.970, acc=0.948, cer=0.066, wer=0.191, loss=59.970, time=4 minutes and 11.51 seconds, total_count=432, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 30.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:58:26,862 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:58:26,875 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/30epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 04:58:26,875 (trainer:272) INFO: 37/60epoch started. Estimated time to finish: 2 days, 8 hours and 22 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:08:23,824 (trainer:732) INFO: 37epoch:train:1-530batch: iter_time=0.004, forward_time=0.227, loss_att=47.564, acc=0.959, loss=47.564, backward_time=0.332, grad_norm=86.877, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.149e-04, train_time=4.511 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:15:02,326 (trainer:732) INFO: 37epoch:train:531-1060batch: iter_time=5.176e-04, forward_time=0.225, loss_att=47.329, acc=0.958, loss=47.329, backward_time=0.332, grad_norm=92.360, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.143e-04, train_time=3.006 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:21:41,301 (trainer:732) INFO: 37epoch:train:1061-1590batch: iter_time=5.239e-04, forward_time=0.226, loss_att=48.670, acc=0.957, loss=48.670, backward_time=0.333, grad_norm=93.305, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.136e-04, train_time=3.012 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:28:20,429 (trainer:732) INFO: 37epoch:train:1591-2120batch: iter_time=6.835e-04, forward_time=0.225, loss_att=47.100, acc=0.958, loss=47.100, backward_time=0.332, grad_norm=91.839, clip=100.000, loss_scale=1.000, optim_step_time=0.147, optim0_lr0=9.130e-04, train_time=3.010 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:34:57,076 (trainer:732) INFO: 37epoch:train:2121-2650batch: iter_time=5.576e-04, forward_time=0.224, loss_att=48.679, acc=0.957, loss=48.679, backward_time=0.331, grad_norm=85.582, clip=100.000, loss_scale=1.000, optim_step_time=0.140, optim0_lr0=9.124e-04, train_time=2.994 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:41:36,234 (trainer:732) INFO: 37epoch:train:2651-3180batch: iter_time=5.670e-04, forward_time=0.224, loss_att=48.789, acc=0.958, loss=48.789, backward_time=0.333, grad_norm=84.814, clip=100.000, loss_scale=1.000, optim_step_time=0.149, optim0_lr0=9.117e-04, train_time=3.011 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:48:13,464 (trainer:732) INFO: 37epoch:train:3181-3710batch: iter_time=4.964e-04, forward_time=0.223, loss_att=47.611, acc=0.958, loss=47.611, backward_time=0.332, grad_norm=92.481, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.111e-04, train_time=2.999 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 05:54:50,678 (trainer:732) INFO: 37epoch:train:3711-4240batch: iter_time=5.183e-04, forward_time=0.222, loss_att=46.274, acc=0.958, loss=46.274, backward_time=0.332, grad_norm=98.921, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.105e-04, train_time=2.995 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:01:27,574 (trainer:732) INFO: 37epoch:train:4241-4770batch: iter_time=6.050e-04, forward_time=0.222, loss_att=47.379, acc=0.958, loss=47.379, backward_time=0.333, grad_norm=89.380, clip=100.000, loss_scale=1.000, optim_step_time=0.145, optim0_lr0=9.099e-04, train_time=2.996 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:08:04,227 (trainer:732) INFO: 37epoch:train:4771-5300batch: iter_time=4.960e-04, forward_time=0.222, loss_att=48.623, acc=0.958, loss=48.623, backward_time=0.331, grad_norm=89.875, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.092e-04, train_time=2.992 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:14:40,774 (trainer:732) INFO: 37epoch:train:5301-5830batch: iter_time=4.796e-04, forward_time=0.223, loss_att=48.203, acc=0.958, loss=48.203, backward_time=0.332, grad_norm=91.165, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.086e-04, train_time=2.993 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:21:15,768 (trainer:732) INFO: 37epoch:train:5831-6360batch: iter_time=5.259e-04, forward_time=0.221, loss_att=47.477, acc=0.957, loss=47.477, backward_time=0.330, grad_norm=86.325, clip=100.000, loss_scale=1.000, optim_step_time=0.142, optim0_lr0=9.080e-04, train_time=2.980 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:27:50,774 (trainer:732) INFO: 37epoch:train:6361-6890batch: iter_time=4.590e-04, forward_time=0.221, loss_att=47.493, acc=0.958, loss=47.493, backward_time=0.332, grad_norm=84.876, clip=100.000, loss_scale=1.000, optim_step_time=0.137, optim0_lr0=9.074e-04, train_time=2.981 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:34:24,424 (trainer:732) INFO: 37epoch:train:6891-7420batch: iter_time=4.951e-04, forward_time=0.220, loss_att=47.791, acc=0.958, loss=47.791, backward_time=0.330, grad_norm=89.074, clip=100.000, loss_scale=1.000, optim_step_time=0.135, optim0_lr0=9.068e-04, train_time=2.970 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:40:59,045 (trainer:732) INFO: 37epoch:train:7421-7950batch: iter_time=4.597e-04, forward_time=0.221, loss_att=48.082, acc=0.957, loss=48.082, backward_time=0.331, grad_norm=88.753, clip=100.000, loss_scale=1.000, optim_step_time=0.144, optim0_lr0=9.061e-04, train_time=2.979 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:47:33,620 (trainer:732) INFO: 37epoch:train:7951-8480batch: iter_time=4.922e-04, forward_time=0.221, loss_att=48.353, acc=0.957, loss=48.353, backward_time=0.331, grad_norm=89.521, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.055e-04, train_time=2.976 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 06:54:07,571 (trainer:732) INFO: 37epoch:train:8481-9010batch: iter_time=5.867e-04, forward_time=0.220, loss_att=48.336, acc=0.957, loss=48.336, backward_time=0.330, grad_norm=86.626, clip=100.000, loss_scale=1.000, optim_step_time=0.139, optim0_lr0=9.049e-04, train_time=2.974 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:00:42,369 (trainer:732) INFO: 37epoch:train:9011-9540batch: iter_time=5.086e-04, forward_time=0.221, loss_att=48.309, acc=0.957, loss=48.309, backward_time=0.330, grad_norm=91.119, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.043e-04, train_time=2.978 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:07:17,219 (trainer:732) INFO: 37epoch:train:9541-10070batch: iter_time=4.682e-04, forward_time=0.221, loss_att=48.536, acc=0.957, loss=48.536, backward_time=0.329, grad_norm=87.702, clip=100.000, loss_scale=1.000, optim_step_time=0.143, optim0_lr0=9.037e-04, train_time=2.981 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:13:52,044 (trainer:732) INFO: 37epoch:train:10071-10600batch: iter_time=5.802e-04, forward_time=0.221, loss_att=48.382, acc=0.957, loss=48.382, backward_time=0.331, grad_norm=88.695, clip=100.000, loss_scale=1.000, optim_step_time=0.138, optim0_lr0=9.031e-04, train_time=2.978 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:23:17,538 (trainer:338) INFO: 37epoch results: [train] iter_time=6.915e-04, forward_time=0.222, loss_att=47.946, acc=0.958, loss=47.946, backward_time=0.331, grad_norm=89.483, clip=100.000, loss_scale=1.000, optim_step_time=0.141, optim0_lr0=9.089e-04, train_time=3.065, time=2 hours, 15 minutes and 45.25 seconds, total_count=392755, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.917, acc=0.949, cer=0.063, wer=0.187, loss=58.917, time=4 minutes and 0.67 seconds, total_count=444, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 4.73 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:23:29,318 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:23:29,331 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/26epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:23:29,332 (trainer:272) INFO: 38/60epoch started. Estimated time to finish: 2 days, 6 hours and 4 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:34:01,810 (trainer:732) INFO: 38epoch:train:1-530batch: iter_time=0.006, forward_time=0.241, loss_att=46.146, acc=0.960, loss=46.146, backward_time=0.341, grad_norm=83.167, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=9.025e-04, train_time=4.779 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:40:57,711 (trainer:732) INFO: 38epoch:train:531-1060batch: iter_time=0.001, forward_time=0.240, loss_att=45.802, acc=0.959, loss=45.802, backward_time=0.339, grad_norm=89.114, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=9.018e-04, train_time=3.136 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:47:51,901 (trainer:732) INFO: 38epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.239, loss_att=46.706, acc=0.958, loss=46.706, backward_time=0.339, grad_norm=89.122, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=9.012e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 07:54:45,835 (trainer:732) INFO: 38epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.237, loss_att=46.667, acc=0.958, loss=46.667, backward_time=0.335, grad_norm=92.791, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=9.006e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:01:37,653 (trainer:732) INFO: 38epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=47.198, acc=0.958, loss=47.198, backward_time=0.336, grad_norm=87.819, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=9.000e-04, train_time=3.109 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:08:30,783 (trainer:732) INFO: 38epoch:train:2651-3180batch: iter_time=9.087e-04, forward_time=0.237, loss_att=46.839, acc=0.959, loss=46.839, backward_time=0.336, grad_norm=84.616, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.994e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:15:22,228 (trainer:732) INFO: 38epoch:train:3181-3710batch: iter_time=8.674e-04, forward_time=0.235, loss_att=47.789, acc=0.958, loss=47.789, backward_time=0.336, grad_norm=91.702, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.988e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:22:11,636 (trainer:732) INFO: 38epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.235, loss_att=48.106, acc=0.957, loss=48.106, backward_time=0.335, grad_norm=86.493, clip=100.000, loss_scale=1.000, optim_step_time=0.184, optim0_lr0=8.982e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:29:03,891 (trainer:732) INFO: 38epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.237, loss_att=47.697, acc=0.958, loss=47.697, backward_time=0.337, grad_norm=87.129, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.976e-04, train_time=3.112 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:35:55,640 (trainer:732) INFO: 38epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.236, loss_att=47.612, acc=0.958, loss=47.612, backward_time=0.336, grad_norm=84.699, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.970e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:42:45,567 (trainer:732) INFO: 38epoch:train:5301-5830batch: iter_time=9.989e-04, forward_time=0.235, loss_att=47.115, acc=0.959, loss=47.115, backward_time=0.335, grad_norm=86.954, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.964e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:49:35,498 (trainer:732) INFO: 38epoch:train:5831-6360batch: iter_time=9.161e-04, forward_time=0.234, loss_att=47.231, acc=0.957, loss=47.231, backward_time=0.333, grad_norm=88.113, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.958e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 08:56:26,245 (trainer:732) INFO: 38epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.234, loss_att=46.905, acc=0.958, loss=46.905, backward_time=0.336, grad_norm=90.225, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.952e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:03:20,354 (trainer:732) INFO: 38epoch:train:6891-7420batch: iter_time=9.406e-04, forward_time=0.236, loss_att=48.113, acc=0.958, loss=48.113, backward_time=0.337, grad_norm=91.111, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=8.946e-04, train_time=3.122 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:10:09,463 (trainer:732) INFO: 38epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.234, loss_att=47.393, acc=0.958, loss=47.393, backward_time=0.336, grad_norm=90.797, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=8.941e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:17:01,771 (trainer:732) INFO: 38epoch:train:7951-8480batch: iter_time=8.352e-04, forward_time=0.235, loss_att=48.152, acc=0.958, loss=48.152, backward_time=0.336, grad_norm=89.474, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.935e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:23:52,632 (trainer:732) INFO: 38epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.234, loss_att=49.046, acc=0.958, loss=49.046, backward_time=0.336, grad_norm=90.398, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.929e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:30:41,546 (trainer:732) INFO: 38epoch:train:9011-9540batch: iter_time=8.305e-04, forward_time=0.232, loss_att=46.901, acc=0.958, loss=46.901, backward_time=0.332, grad_norm=84.567, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=8.923e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:37:29,151 (trainer:732) INFO: 38epoch:train:9541-10070batch: iter_time=9.375e-04, forward_time=0.232, loss_att=48.210, acc=0.957, loss=48.210, backward_time=0.334, grad_norm=87.898, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.917e-04, train_time=3.077 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:44:19,275 (trainer:732) INFO: 38epoch:train:10071-10600batch: iter_time=8.965e-04, forward_time=0.234, loss_att=47.874, acc=0.957, loss=47.874, backward_time=0.334, grad_norm=88.461, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.911e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:53:41,951 (trainer:338) INFO: 38epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=47.371, acc=0.958, loss=47.371, backward_time=0.336, grad_norm=88.234, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.967e-04, train_time=3.188, time=2 hours, 21 minutes and 10.47 seconds, total_count=403370, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.001, acc=0.949, cer=0.064, wer=0.191, loss=59.001, time=3 minutes and 54.61 seconds, total_count=456, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 7.53 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:53:52,693 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:53:52,706 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/27epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 09:53:52,706 (trainer:272) INFO: 39/60epoch started. Estimated time to finish: 2 days, 3 hours and 50 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:04:27,060 (trainer:732) INFO: 39epoch:train:1-530batch: iter_time=0.003, forward_time=0.240, loss_att=46.248, acc=0.959, loss=46.248, backward_time=0.339, grad_norm=84.042, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.905e-04, train_time=4.794 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:11:21,866 (trainer:732) INFO: 39epoch:train:531-1060batch: iter_time=0.001, forward_time=0.240, loss_att=46.841, acc=0.959, loss=46.841, backward_time=0.340, grad_norm=88.660, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.899e-04, train_time=3.128 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:18:15,370 (trainer:732) INFO: 39epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.237, loss_att=46.915, acc=0.959, loss=46.915, backward_time=0.337, grad_norm=91.479, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.893e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:25:06,932 (trainer:732) INFO: 39epoch:train:1591-2120batch: iter_time=9.968e-04, forward_time=0.237, loss_att=46.337, acc=0.958, loss=46.337, backward_time=0.334, grad_norm=91.017, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.888e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:31:56,806 (trainer:732) INFO: 39epoch:train:2121-2650batch: iter_time=9.923e-04, forward_time=0.235, loss_att=45.208, acc=0.959, loss=45.208, backward_time=0.335, grad_norm=84.850, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.882e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:38:48,948 (trainer:732) INFO: 39epoch:train:2651-3180batch: iter_time=8.633e-04, forward_time=0.236, loss_att=47.118, acc=0.959, loss=47.118, backward_time=0.336, grad_norm=88.884, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.876e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:45:38,105 (trainer:732) INFO: 39epoch:train:3181-3710batch: iter_time=9.750e-04, forward_time=0.235, loss_att=46.264, acc=0.959, loss=46.264, backward_time=0.335, grad_norm=90.099, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.870e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:52:28,147 (trainer:732) INFO: 39epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.233, loss_att=46.968, acc=0.959, loss=46.968, backward_time=0.335, grad_norm=89.185, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.864e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 10:59:19,781 (trainer:732) INFO: 39epoch:train:4241-4770batch: iter_time=7.665e-04, forward_time=0.236, loss_att=47.436, acc=0.958, loss=47.436, backward_time=0.336, grad_norm=82.486, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.859e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:06:17,433 (trainer:732) INFO: 39epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.238, loss_att=47.192, acc=0.960, loss=47.192, backward_time=0.340, grad_norm=95.855, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=8.853e-04, train_time=3.149 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:13:11,036 (trainer:732) INFO: 39epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.237, loss_att=47.573, acc=0.959, loss=47.573, backward_time=0.337, grad_norm=89.553, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.847e-04, train_time=3.123 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:20:03,892 (trainer:732) INFO: 39epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.238, loss_att=46.828, acc=0.959, loss=46.828, backward_time=0.336, grad_norm=96.242, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.841e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:27:02,691 (trainer:732) INFO: 39epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.242, loss_att=46.963, acc=0.958, loss=46.963, backward_time=0.340, grad_norm=89.082, clip=100.000, loss_scale=1.000, optim_step_time=0.210, optim0_lr0=8.836e-04, train_time=3.161 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:33:57,435 (trainer:732) INFO: 39epoch:train:6891-7420batch: iter_time=9.924e-04, forward_time=0.237, loss_att=46.861, acc=0.959, loss=46.861, backward_time=0.338, grad_norm=85.994, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.830e-04, train_time=3.128 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:40:48,518 (trainer:732) INFO: 39epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.237, loss_att=47.160, acc=0.958, loss=47.160, backward_time=0.334, grad_norm=99.494, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.824e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:47:40,399 (trainer:732) INFO: 39epoch:train:7951-8480batch: iter_time=9.958e-04, forward_time=0.236, loss_att=47.073, acc=0.958, loss=47.073, backward_time=0.336, grad_norm=86.265, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.819e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 11:54:34,125 (trainer:732) INFO: 39epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.238, loss_att=47.395, acc=0.959, loss=47.395, backward_time=0.337, grad_norm=103.323, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.813e-04, train_time=3.123 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:01:25,297 (trainer:732) INFO: 39epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.237, loss_att=47.767, acc=0.957, loss=47.767, backward_time=0.335, grad_norm=89.831, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.807e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:08:19,328 (trainer:732) INFO: 39epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.237, loss_att=46.098, acc=0.959, loss=46.098, backward_time=0.337, grad_norm=87.461, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.802e-04, train_time=3.126 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:15:10,668 (trainer:732) INFO: 39epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.237, loss_att=46.213, acc=0.959, loss=46.213, backward_time=0.335, grad_norm=88.023, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.796e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:24:22,896 (trainer:338) INFO: 39epoch results: [train] iter_time=0.001, forward_time=0.237, loss_att=46.834, acc=0.959, loss=46.834, backward_time=0.337, grad_norm=90.100, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.850e-04, train_time=3.198, time=2 hours, 21 minutes and 38.73 seconds, total_count=413985, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.987, acc=0.949, cer=0.064, wer=0.189, loss=58.987, time=3 minutes and 50.64 seconds, total_count=468, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 0.82 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:24:34,350 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:24:34,364 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/28epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:24:34,365 (trainer:272) INFO: 40/60epoch started. Estimated time to finish: 2 days, 1 hour and 35 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:35:01,683 (trainer:732) INFO: 40epoch:train:1-530batch: iter_time=0.006, forward_time=0.240, loss_att=45.289, acc=0.959, loss=45.289, backward_time=0.338, grad_norm=88.954, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.790e-04, train_time=4.740 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:41:56,791 (trainer:732) INFO: 40epoch:train:531-1060batch: iter_time=0.001, forward_time=0.239, loss_att=45.950, acc=0.960, loss=45.950, backward_time=0.338, grad_norm=88.293, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.785e-04, train_time=3.130 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:48:52,546 (trainer:732) INFO: 40epoch:train:1061-1590batch: iter_time=9.707e-04, forward_time=0.241, loss_att=46.263, acc=0.960, loss=46.263, backward_time=0.338, grad_norm=94.370, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.779e-04, train_time=3.138 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 12:55:46,391 (trainer:732) INFO: 40epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.238, loss_att=43.821, acc=0.960, loss=43.821, backward_time=0.338, grad_norm=98.952, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.773e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:02:38,769 (trainer:732) INFO: 40epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.238, loss_att=46.779, acc=0.958, loss=46.779, backward_time=0.337, grad_norm=91.574, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.768e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:09:32,801 (trainer:732) INFO: 40epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.238, loss_att=45.452, acc=0.960, loss=45.452, backward_time=0.338, grad_norm=83.293, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.762e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:16:25,472 (trainer:732) INFO: 40epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.237, loss_att=45.785, acc=0.959, loss=45.785, backward_time=0.337, grad_norm=93.533, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.757e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:23:18,820 (trainer:732) INFO: 40epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.237, loss_att=44.985, acc=0.960, loss=44.985, backward_time=0.337, grad_norm=94.375, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.751e-04, train_time=3.119 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:30:11,623 (trainer:732) INFO: 40epoch:train:4241-4770batch: iter_time=9.735e-04, forward_time=0.237, loss_att=46.061, acc=0.959, loss=46.061, backward_time=0.336, grad_norm=86.237, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=8.746e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:37:02,472 (trainer:732) INFO: 40epoch:train:4771-5300batch: iter_time=9.808e-04, forward_time=0.235, loss_att=47.023, acc=0.958, loss=47.023, backward_time=0.335, grad_norm=98.162, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.740e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:43:53,091 (trainer:732) INFO: 40epoch:train:5301-5830batch: iter_time=8.522e-04, forward_time=0.234, loss_att=46.763, acc=0.959, loss=46.763, backward_time=0.334, grad_norm=90.746, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.735e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:50:47,073 (trainer:732) INFO: 40epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.236, loss_att=46.328, acc=0.959, loss=46.328, backward_time=0.336, grad_norm=89.741, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=8.729e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 13:57:40,845 (trainer:732) INFO: 40epoch:train:6361-6890batch: iter_time=9.893e-04, forward_time=0.236, loss_att=47.351, acc=0.959, loss=47.351, backward_time=0.338, grad_norm=85.812, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.724e-04, train_time=3.124 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:04:32,519 (trainer:732) INFO: 40epoch:train:6891-7420batch: iter_time=8.495e-04, forward_time=0.236, loss_att=47.488, acc=0.958, loss=47.488, backward_time=0.336, grad_norm=91.970, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.718e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:11:27,489 (trainer:732) INFO: 40epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.238, loss_att=45.823, acc=0.960, loss=45.823, backward_time=0.338, grad_norm=87.730, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.713e-04, train_time=3.132 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:18:21,313 (trainer:732) INFO: 40epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.236, loss_att=46.579, acc=0.959, loss=46.579, backward_time=0.337, grad_norm=89.080, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.707e-04, train_time=3.121 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:25:13,066 (trainer:732) INFO: 40epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.235, loss_att=46.020, acc=0.959, loss=46.020, backward_time=0.334, grad_norm=86.416, clip=100.000, loss_scale=1.000, optim_step_time=0.205, optim0_lr0=8.702e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:32:03,419 (trainer:732) INFO: 40epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.233, loss_att=47.096, acc=0.958, loss=47.096, backward_time=0.335, grad_norm=88.228, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.696e-04, train_time=3.095 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:38:53,699 (trainer:732) INFO: 40epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.234, loss_att=47.093, acc=0.959, loss=47.093, backward_time=0.335, grad_norm=96.881, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.691e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:45:44,407 (trainer:732) INFO: 40epoch:train:10071-10600batch: iter_time=9.098e-04, forward_time=0.233, loss_att=47.140, acc=0.959, loss=47.140, backward_time=0.336, grad_norm=86.922, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.685e-04, train_time=3.098 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 110) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 110) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:55:03,789 (trainer:338) INFO: 40epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=46.258, acc=0.959, loss=46.258, backward_time=0.337, grad_norm=90.696, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.737e-04, train_time=3.195, time=2 hours, 21 minutes and 29.48 seconds, total_count=424600, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.318, acc=0.949, cer=0.065, wer=0.190, loss=59.318, time=3 minutes and 53.97 seconds, total_count=480, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 5.96 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:55:14,456 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:55:14,483 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/31epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 14:55:14,484 (trainer:272) INFO: 41/60epoch started. Estimated time to finish: 1 day, 23 hours and 19 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:05:47,129 (trainer:732) INFO: 41epoch:train:1-530batch: iter_time=0.003, forward_time=0.239, loss_att=44.653, acc=0.960, loss=44.653, backward_time=0.337, grad_norm=87.351, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.680e-04, train_time=4.781 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:12:42,824 (trainer:732) INFO: 41epoch:train:531-1060batch: iter_time=0.001, forward_time=0.240, loss_att=45.412, acc=0.960, loss=45.412, backward_time=0.339, grad_norm=91.524, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=8.674e-04, train_time=3.135 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:19:33,289 (trainer:732) INFO: 41epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.236, loss_att=46.217, acc=0.959, loss=46.217, backward_time=0.337, grad_norm=85.535, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=8.669e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:26:28,195 (trainer:732) INFO: 41epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.239, loss_att=45.678, acc=0.960, loss=45.678, backward_time=0.337, grad_norm=82.815, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.664e-04, train_time=3.129 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:33:18,275 (trainer:732) INFO: 41epoch:train:2121-2650batch: iter_time=9.142e-04, forward_time=0.235, loss_att=45.203, acc=0.960, loss=45.203, backward_time=0.336, grad_norm=87.487, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.658e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:40:11,256 (trainer:732) INFO: 41epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.236, loss_att=45.249, acc=0.960, loss=45.249, backward_time=0.337, grad_norm=86.758, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.653e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:47:03,288 (trainer:732) INFO: 41epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.237, loss_att=45.322, acc=0.959, loss=45.322, backward_time=0.336, grad_norm=86.453, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.647e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 15:53:53,688 (trainer:732) INFO: 41epoch:train:3711-4240batch: iter_time=8.011e-04, forward_time=0.235, loss_att=46.017, acc=0.959, loss=46.017, backward_time=0.335, grad_norm=93.040, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.642e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:00:48,850 (trainer:732) INFO: 41epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.239, loss_att=45.718, acc=0.960, loss=45.718, backward_time=0.338, grad_norm=113.941, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.637e-04, train_time=3.135 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:07:45,410 (trainer:732) INFO: 41epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.239, loss_att=46.320, acc=0.959, loss=46.320, backward_time=0.339, grad_norm=90.722, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=8.631e-04, train_time=3.141 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:14:39,251 (trainer:732) INFO: 41epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.238, loss_att=45.744, acc=0.960, loss=45.744, backward_time=0.337, grad_norm=90.816, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.626e-04, train_time=3.124 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:21:31,953 (trainer:732) INFO: 41epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.237, loss_att=46.232, acc=0.959, loss=46.232, backward_time=0.337, grad_norm=91.820, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=8.621e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:28:21,206 (trainer:732) INFO: 41epoch:train:6361-6890batch: iter_time=8.763e-04, forward_time=0.235, loss_att=45.525, acc=0.959, loss=45.525, backward_time=0.335, grad_norm=93.911, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.616e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:35:11,785 (trainer:732) INFO: 41epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.236, loss_att=45.514, acc=0.959, loss=45.514, backward_time=0.335, grad_norm=84.017, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.610e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:42:04,657 (trainer:732) INFO: 41epoch:train:7421-7950batch: iter_time=9.227e-04, forward_time=0.237, loss_att=45.964, acc=0.959, loss=45.964, backward_time=0.337, grad_norm=91.022, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.605e-04, train_time=3.117 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:48:55,991 (trainer:732) INFO: 41epoch:train:7951-8480batch: iter_time=9.393e-04, forward_time=0.237, loss_att=45.612, acc=0.959, loss=45.612, backward_time=0.335, grad_norm=85.379, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.600e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 16:55:47,082 (trainer:732) INFO: 41epoch:train:8481-9010batch: iter_time=9.364e-04, forward_time=0.235, loss_att=45.846, acc=0.960, loss=45.846, backward_time=0.337, grad_norm=88.053, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.594e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:02:36,310 (trainer:732) INFO: 41epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.234, loss_att=45.893, acc=0.959, loss=45.893, backward_time=0.334, grad_norm=94.432, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=8.589e-04, train_time=3.086 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<46420> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:09:28,643 (trainer:732) INFO: 41epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.235, loss_att=45.932, acc=0.960, loss=45.932, backward_time=0.336, grad_norm=85.701, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.584e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:16:17,215 (trainer:732) INFO: 41epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.234, loss_att=46.519, acc=0.959, loss=46.519, backward_time=0.333, grad_norm=89.622, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.579e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:25:32,149 (trainer:338) INFO: 41epoch results: [train] iter_time=0.001, forward_time=0.237, loss_att=45.732, acc=0.960, loss=45.732, backward_time=0.336, grad_norm=90.022, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.629e-04, train_time=3.192, time=2 hours, 21 minutes and 23.6 seconds, total_count=435215, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.641, acc=0.950, cer=0.063, wer=0.186, loss=58.641, time=3 minutes and 51.29 seconds, total_count=492, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 2.76 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:25:42,612 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:25:42,624 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/34epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:25:42,625 (trainer:272) INFO: 42/60epoch started. Estimated time to finish: 1 day, 21 hours and 2 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:36:13,876 (trainer:732) INFO: 42epoch:train:1-530batch: iter_time=0.003, forward_time=0.238, loss_att=43.991, acc=0.961, loss=43.991, backward_time=0.337, grad_norm=88.152, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.573e-04, train_time=4.771 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:43:05,103 (trainer:732) INFO: 42epoch:train:531-1060batch: iter_time=0.001, forward_time=0.236, loss_att=44.688, acc=0.960, loss=44.688, backward_time=0.336, grad_norm=86.652, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=8.568e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:49:57,544 (trainer:732) INFO: 42epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.237, loss_att=45.616, acc=0.960, loss=45.616, backward_time=0.336, grad_norm=94.464, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.563e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 17:56:47,654 (trainer:732) INFO: 42epoch:train:1591-2120batch: iter_time=9.652e-04, forward_time=0.235, loss_att=45.325, acc=0.960, loss=45.325, backward_time=0.334, grad_norm=86.974, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.558e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:03:39,751 (trainer:732) INFO: 42epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=45.031, acc=0.961, loss=45.031, backward_time=0.337, grad_norm=89.591, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=8.553e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:10:31,159 (trainer:732) INFO: 42epoch:train:2651-3180batch: iter_time=7.989e-04, forward_time=0.237, loss_att=45.277, acc=0.960, loss=45.277, backward_time=0.337, grad_norm=94.035, clip=100.000, loss_scale=1.000, optim_step_time=0.183, optim0_lr0=8.547e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:17:23,787 (trainer:732) INFO: 42epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.237, loss_att=45.391, acc=0.960, loss=45.391, backward_time=0.337, grad_norm=91.341, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.542e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:24:16,602 (trainer:732) INFO: 42epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.238, loss_att=45.123, acc=0.960, loss=45.123, backward_time=0.337, grad_norm=89.529, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.537e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:31:06,739 (trainer:732) INFO: 42epoch:train:4241-4770batch: iter_time=9.757e-04, forward_time=0.235, loss_att=44.791, acc=0.961, loss=44.791, backward_time=0.335, grad_norm=89.815, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.532e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:37:57,879 (trainer:732) INFO: 42epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.234, loss_att=46.303, acc=0.960, loss=46.303, backward_time=0.336, grad_norm=89.815, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.527e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:44:47,893 (trainer:732) INFO: 42epoch:train:5301-5830batch: iter_time=9.860e-04, forward_time=0.235, loss_att=45.326, acc=0.960, loss=45.326, backward_time=0.335, grad_norm=84.960, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.522e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:51:37,225 (trainer:732) INFO: 42epoch:train:5831-6360batch: iter_time=8.424e-04, forward_time=0.234, loss_att=45.712, acc=0.959, loss=45.712, backward_time=0.333, grad_norm=80.283, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.517e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 18:58:27,019 (trainer:732) INFO: 42epoch:train:6361-6890batch: iter_time=8.654e-04, forward_time=0.234, loss_att=45.721, acc=0.961, loss=45.721, backward_time=0.335, grad_norm=88.877, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.511e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:05:16,240 (trainer:732) INFO: 42epoch:train:6891-7420batch: iter_time=8.494e-04, forward_time=0.234, loss_att=44.881, acc=0.960, loss=44.881, backward_time=0.333, grad_norm=87.075, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.506e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:12:07,772 (trainer:732) INFO: 42epoch:train:7421-7950batch: iter_time=9.022e-04, forward_time=0.236, loss_att=45.921, acc=0.960, loss=45.921, backward_time=0.336, grad_norm=87.374, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.501e-04, train_time=3.107 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 155) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 155) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:18:58,287 (trainer:732) INFO: 42epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.235, loss_att=45.052, acc=0.959, loss=45.052, backward_time=0.334, grad_norm=83.860, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.496e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:25:47,647 (trainer:732) INFO: 42epoch:train:8481-9010batch: iter_time=9.743e-04, forward_time=0.235, loss_att=44.869, acc=0.960, loss=44.869, backward_time=0.334, grad_norm=87.861, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.491e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:32:40,665 (trainer:732) INFO: 42epoch:train:9011-9540batch: iter_time=9.859e-04, forward_time=0.236, loss_att=45.617, acc=0.959, loss=45.617, backward_time=0.336, grad_norm=90.087, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=8.486e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:39:31,230 (trainer:732) INFO: 42epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.236, loss_att=46.517, acc=0.960, loss=46.517, backward_time=0.335, grad_norm=93.014, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.481e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:46:20,651 (trainer:732) INFO: 42epoch:train:10071-10600batch: iter_time=9.691e-04, forward_time=0.234, loss_att=45.793, acc=0.959, loss=45.793, backward_time=0.334, grad_norm=85.689, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.476e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:55:13,892 (trainer:338) INFO: 42epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=45.340, acc=0.960, loss=45.340, backward_time=0.335, grad_norm=88.471, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.524e-04, train_time=3.183, time=2 hours, 20 minutes and 59.15 seconds, total_count=445830, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.652, acc=0.949, cer=0.065, wer=0.189, loss=59.652, time=3 minutes and 33.13 seconds, total_count=504, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 58.98 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:55:25,144 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:55:25,157 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/32epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 19:55:25,158 (trainer:272) INFO: 43/60epoch started. Estimated time to finish: 1 day, 18 hours and 43 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:06:03,671 (trainer:732) INFO: 43epoch:train:1-530batch: iter_time=0.003, forward_time=0.241, loss_att=43.623, acc=0.961, loss=43.623, backward_time=0.340, grad_norm=89.647, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.471e-04, train_time=4.825 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:12:55,239 (trainer:732) INFO: 43epoch:train:531-1060batch: iter_time=9.100e-04, forward_time=0.237, loss_att=43.421, acc=0.961, loss=43.421, backward_time=0.336, grad_norm=84.237, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.466e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:19:51,285 (trainer:732) INFO: 43epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.239, loss_att=44.320, acc=0.961, loss=44.320, backward_time=0.338, grad_norm=87.247, clip=100.000, loss_scale=1.000, optim_step_time=0.207, optim0_lr0=8.461e-04, train_time=3.140 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:26:47,539 (trainer:732) INFO: 43epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.240, loss_att=44.288, acc=0.961, loss=44.288, backward_time=0.338, grad_norm=82.557, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=8.456e-04, train_time=3.140 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:33:42,295 (trainer:732) INFO: 43epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.238, loss_att=44.522, acc=0.961, loss=44.522, backward_time=0.338, grad_norm=81.077, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=8.451e-04, train_time=3.131 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:40:33,888 (trainer:732) INFO: 43epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.235, loss_att=45.706, acc=0.960, loss=45.706, backward_time=0.337, grad_norm=91.670, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.446e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:47:22,432 (trainer:732) INFO: 43epoch:train:3181-3710batch: iter_time=7.906e-04, forward_time=0.234, loss_att=44.109, acc=0.960, loss=44.109, backward_time=0.335, grad_norm=86.241, clip=100.000, loss_scale=1.000, optim_step_time=0.182, optim0_lr0=8.441e-04, train_time=3.082 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 20:54:14,126 (trainer:732) INFO: 43epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.235, loss_att=44.906, acc=0.960, loss=44.906, backward_time=0.335, grad_norm=90.529, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.436e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:01:03,247 (trainer:732) INFO: 43epoch:train:4241-4770batch: iter_time=9.120e-04, forward_time=0.234, loss_att=44.094, acc=0.961, loss=44.094, backward_time=0.335, grad_norm=84.956, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.431e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:07:52,953 (trainer:732) INFO: 43epoch:train:4771-5300batch: iter_time=9.435e-04, forward_time=0.235, loss_att=45.091, acc=0.960, loss=45.091, backward_time=0.334, grad_norm=89.753, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.426e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:14:41,404 (trainer:732) INFO: 43epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.234, loss_att=45.192, acc=0.959, loss=45.192, backward_time=0.333, grad_norm=86.489, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.421e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:21:34,369 (trainer:732) INFO: 43epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.235, loss_att=45.790, acc=0.960, loss=45.790, backward_time=0.336, grad_norm=86.416, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.416e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:28:23,099 (trainer:732) INFO: 43epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.233, loss_att=44.130, acc=0.961, loss=44.130, backward_time=0.334, grad_norm=87.968, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.411e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:35:11,471 (trainer:732) INFO: 43epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.232, loss_att=46.293, acc=0.959, loss=46.293, backward_time=0.334, grad_norm=89.853, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.406e-04, train_time=3.079 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:42:00,888 (trainer:732) INFO: 43epoch:train:7421-7950batch: iter_time=9.341e-04, forward_time=0.232, loss_att=45.985, acc=0.960, loss=45.985, backward_time=0.336, grad_norm=86.761, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.401e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:48:49,764 (trainer:732) INFO: 43epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.233, loss_att=45.104, acc=0.960, loss=45.104, backward_time=0.335, grad_norm=86.819, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.396e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 21:55:39,746 (trainer:732) INFO: 43epoch:train:8481-9010batch: iter_time=8.845e-04, forward_time=0.233, loss_att=44.793, acc=0.961, loss=44.793, backward_time=0.336, grad_norm=88.815, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.391e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:02:30,454 (trainer:732) INFO: 43epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.233, loss_att=45.046, acc=0.960, loss=45.046, backward_time=0.333, grad_norm=86.403, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=8.386e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:09:19,763 (trainer:732) INFO: 43epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.233, loss_att=44.708, acc=0.961, loss=44.708, backward_time=0.334, grad_norm=84.935, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.382e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:16:11,150 (trainer:732) INFO: 43epoch:train:10071-10600batch: iter_time=9.611e-04, forward_time=0.234, loss_att=44.691, acc=0.960, loss=44.691, backward_time=0.335, grad_norm=89.738, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.377e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:25:31,169 (trainer:338) INFO: 43epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=44.788, acc=0.960, loss=44.788, backward_time=0.336, grad_norm=87.107, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.423e-04, train_time=3.186, time=2 hours, 21 minutes and 5.89 seconds, total_count=456445, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.383, acc=0.950, cer=0.064, wer=0.185, loss=58.383, time=4 minutes and 10.01 seconds, total_count=516, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 50.11 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:25:42,236 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:25:42,249 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/29epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:25:42,249 (trainer:272) INFO: 44/60epoch started. Estimated time to finish: 1 day, 16 hours and 25 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:36:17,076 (trainer:732) INFO: 44epoch:train:1-530batch: iter_time=0.004, forward_time=0.240, loss_att=43.512, acc=0.962, loss=43.512, backward_time=0.336, grad_norm=88.124, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.372e-04, train_time=4.799 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:43:14,064 (trainer:732) INFO: 44epoch:train:531-1060batch: iter_time=9.233e-04, forward_time=0.240, loss_att=43.684, acc=0.962, loss=43.684, backward_time=0.339, grad_norm=90.642, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=8.367e-04, train_time=3.143 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:50:09,342 (trainer:732) INFO: 44epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.239, loss_att=43.804, acc=0.961, loss=43.804, backward_time=0.338, grad_norm=85.507, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.362e-04, train_time=3.135 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 22:57:03,022 (trainer:732) INFO: 44epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.238, loss_att=43.519, acc=0.961, loss=43.519, backward_time=0.336, grad_norm=92.788, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.357e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:03:51,661 (trainer:732) INFO: 44epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.234, loss_att=43.956, acc=0.961, loss=43.956, backward_time=0.333, grad_norm=93.251, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.352e-04, train_time=3.085 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:10:42,824 (trainer:732) INFO: 44epoch:train:2651-3180batch: iter_time=9.992e-04, forward_time=0.234, loss_att=44.041, acc=0.961, loss=44.041, backward_time=0.335, grad_norm=95.079, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.348e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:17:35,078 (trainer:732) INFO: 44epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.235, loss_att=43.728, acc=0.962, loss=43.728, backward_time=0.337, grad_norm=89.322, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.343e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:24:28,688 (trainer:732) INFO: 44epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.237, loss_att=44.469, acc=0.961, loss=44.469, backward_time=0.336, grad_norm=99.553, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.338e-04, train_time=3.118 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:31:20,783 (trainer:732) INFO: 44epoch:train:4241-4770batch: iter_time=8.587e-04, forward_time=0.237, loss_att=45.023, acc=0.960, loss=45.023, backward_time=0.335, grad_norm=88.336, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=8.333e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:38:13,650 (trainer:732) INFO: 44epoch:train:4771-5300batch: iter_time=8.425e-04, forward_time=0.236, loss_att=44.323, acc=0.961, loss=44.323, backward_time=0.337, grad_norm=86.210, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.328e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:45:05,556 (trainer:732) INFO: 44epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.237, loss_att=44.430, acc=0.960, loss=44.430, backward_time=0.336, grad_norm=83.383, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.324e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:51:56,025 (trainer:732) INFO: 44epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.235, loss_att=44.323, acc=0.961, loss=44.323, backward_time=0.334, grad_norm=85.807, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.319e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-24 23:58:48,600 (trainer:732) INFO: 44epoch:train:6361-6890batch: iter_time=9.137e-04, forward_time=0.236, loss_att=44.813, acc=0.961, loss=44.813, backward_time=0.336, grad_norm=91.692, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.314e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:05:42,361 (trainer:732) INFO: 44epoch:train:6891-7420batch: iter_time=8.923e-04, forward_time=0.238, loss_att=44.370, acc=0.960, loss=44.370, backward_time=0.337, grad_norm=85.083, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.309e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:12:36,665 (trainer:732) INFO: 44epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.237, loss_att=44.822, acc=0.960, loss=44.822, backward_time=0.337, grad_norm=85.551, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=8.304e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:19:29,148 (trainer:732) INFO: 44epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.236, loss_att=45.013, acc=0.961, loss=45.013, backward_time=0.336, grad_norm=88.301, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.300e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:26:18,143 (trainer:732) INFO: 44epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.233, loss_att=43.609, acc=0.960, loss=43.609, backward_time=0.332, grad_norm=91.572, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=8.295e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:33:07,042 (trainer:732) INFO: 44epoch:train:9011-9540batch: iter_time=9.615e-04, forward_time=0.233, loss_att=43.906, acc=0.960, loss=43.906, backward_time=0.332, grad_norm=92.692, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.290e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:39:57,466 (trainer:732) INFO: 44epoch:train:9541-10070batch: iter_time=8.837e-04, forward_time=0.233, loss_att=45.593, acc=0.960, loss=45.593, backward_time=0.335, grad_norm=91.722, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.286e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:46:50,797 (trainer:732) INFO: 44epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.237, loss_att=45.627, acc=0.961, loss=45.627, backward_time=0.338, grad_norm=89.466, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=8.281e-04, train_time=3.117 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:56:24,346 (trainer:338) INFO: 44epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=44.325, acc=0.961, loss=44.325, backward_time=0.336, grad_norm=89.705, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.326e-04, train_time=3.195, time=2 hours, 21 minutes and 29.44 seconds, total_count=467060, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.286, acc=0.949, cer=0.065, wer=0.188, loss=59.286, time=4 minutes and 3.95 seconds, total_count=528, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 8.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:56:36,461 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:56:36,475 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/33epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 00:56:36,476 (trainer:272) INFO: 45/60epoch started. Estimated time to finish: 1 day, 14 hours and 6 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:07:09,565 (trainer:732) INFO: 45epoch:train:1-530batch: iter_time=0.004, forward_time=0.242, loss_att=42.982, acc=0.962, loss=42.982, backward_time=0.340, grad_norm=87.258, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.276e-04, train_time=4.784 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:14:08,201 (trainer:732) INFO: 45epoch:train:531-1060batch: iter_time=9.536e-04, forward_time=0.243, loss_att=42.878, acc=0.962, loss=42.878, backward_time=0.340, grad_norm=94.601, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=8.271e-04, train_time=3.157 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:21:07,274 (trainer:732) INFO: 45epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.241, loss_att=43.188, acc=0.962, loss=43.188, backward_time=0.340, grad_norm=86.775, clip=100.000, loss_scale=1.000, optim_step_time=0.209, optim0_lr0=8.267e-04, train_time=3.164 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:28:01,675 (trainer:732) INFO: 45epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.237, loss_att=43.347, acc=0.962, loss=43.347, backward_time=0.337, grad_norm=84.819, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.262e-04, train_time=3.125 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:34:55,988 (trainer:732) INFO: 45epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=44.203, acc=0.960, loss=44.203, backward_time=0.337, grad_norm=94.948, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=8.257e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:41:52,282 (trainer:732) INFO: 45epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.239, loss_att=43.693, acc=0.961, loss=43.693, backward_time=0.338, grad_norm=88.149, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=8.253e-04, train_time=3.139 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:48:50,190 (trainer:732) INFO: 45epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.240, loss_att=44.986, acc=0.960, loss=44.986, backward_time=0.340, grad_norm=90.321, clip=100.000, loss_scale=1.000, optim_step_time=0.208, optim0_lr0=8.248e-04, train_time=3.155 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 01:55:43,480 (trainer:732) INFO: 45epoch:train:3711-4240batch: iter_time=7.871e-04, forward_time=0.238, loss_att=43.840, acc=0.961, loss=43.840, backward_time=0.337, grad_norm=98.748, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.243e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:02:34,989 (trainer:732) INFO: 45epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.237, loss_att=44.233, acc=0.961, loss=44.233, backward_time=0.334, grad_norm=87.621, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.239e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:09:28,738 (trainer:732) INFO: 45epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.235, loss_att=43.826, acc=0.961, loss=43.826, backward_time=0.336, grad_norm=90.983, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=8.234e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:16:19,961 (trainer:732) INFO: 45epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.235, loss_att=44.005, acc=0.961, loss=44.005, backward_time=0.335, grad_norm=90.103, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.230e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:23:09,509 (trainer:732) INFO: 45epoch:train:5831-6360batch: iter_time=9.722e-04, forward_time=0.234, loss_att=44.603, acc=0.961, loss=44.603, backward_time=0.334, grad_norm=87.662, clip=100.000, loss_scale=1.000, optim_step_time=0.183, optim0_lr0=8.225e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:29:58,987 (trainer:732) INFO: 45epoch:train:6361-6890batch: iter_time=9.348e-04, forward_time=0.234, loss_att=44.685, acc=0.961, loss=44.685, backward_time=0.334, grad_norm=87.388, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.220e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:36:49,153 (trainer:732) INFO: 45epoch:train:6891-7420batch: iter_time=8.815e-04, forward_time=0.234, loss_att=43.994, acc=0.961, loss=43.994, backward_time=0.334, grad_norm=92.957, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.216e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:43:38,442 (trainer:732) INFO: 45epoch:train:7421-7950batch: iter_time=9.939e-04, forward_time=0.233, loss_att=43.916, acc=0.962, loss=43.916, backward_time=0.335, grad_norm=90.082, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.211e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:50:29,631 (trainer:732) INFO: 45epoch:train:7951-8480batch: iter_time=9.382e-04, forward_time=0.234, loss_att=44.047, acc=0.961, loss=44.047, backward_time=0.334, grad_norm=90.066, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=8.207e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 02:57:19,412 (trainer:732) INFO: 45epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.233, loss_att=44.421, acc=0.961, loss=44.421, backward_time=0.335, grad_norm=88.138, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.202e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:04:06,688 (trainer:732) INFO: 45epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.231, loss_att=43.930, acc=0.961, loss=43.930, backward_time=0.334, grad_norm=87.052, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.197e-04, train_time=3.071 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:10:54,886 (trainer:732) INFO: 45epoch:train:9541-10070batch: iter_time=9.922e-04, forward_time=0.234, loss_att=44.201, acc=0.961, loss=44.201, backward_time=0.333, grad_norm=82.087, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.193e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:17:45,089 (trainer:732) INFO: 45epoch:train:10071-10600batch: iter_time=9.253e-04, forward_time=0.233, loss_att=43.704, acc=0.961, loss=43.704, backward_time=0.334, grad_norm=90.559, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.188e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:26:57,582 (trainer:338) INFO: 45epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=43.951, acc=0.961, loss=43.951, backward_time=0.336, grad_norm=89.547, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.232e-04, train_time=3.195, time=2 hours, 21 minutes and 28.61 seconds, total_count=477675, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.809, acc=0.950, cer=0.065, wer=0.187, loss=58.809, time=3 minutes and 56.98 seconds, total_count=540, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 55.52 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:27:08,776 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:27:08,790 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/36epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:27:08,791 (trainer:272) INFO: 46/60epoch started. Estimated time to finish: 1 day, 11 hours and 46 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:37:39,015 (trainer:732) INFO: 46epoch:train:1-530batch: iter_time=0.003, forward_time=0.240, loss_att=42.781, acc=0.962, loss=42.781, backward_time=0.339, grad_norm=81.002, clip=100.000, loss_scale=1.000, optim_step_time=0.184, optim0_lr0=8.184e-04, train_time=4.763 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:44:30,843 (trainer:732) INFO: 46epoch:train:531-1060batch: iter_time=0.001, forward_time=0.236, loss_att=42.767, acc=0.962, loss=42.767, backward_time=0.335, grad_norm=100.208, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.179e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:51:23,682 (trainer:732) INFO: 46epoch:train:1061-1590batch: iter_time=8.327e-04, forward_time=0.236, loss_att=42.683, acc=0.962, loss=42.683, backward_time=0.336, grad_norm=89.822, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.175e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 03:58:15,531 (trainer:732) INFO: 46epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.236, loss_att=44.423, acc=0.961, loss=44.423, backward_time=0.336, grad_norm=93.032, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.170e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:05:07,983 (trainer:732) INFO: 46epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.236, loss_att=43.079, acc=0.962, loss=43.079, backward_time=0.336, grad_norm=86.775, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.166e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:12:03,336 (trainer:732) INFO: 46epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.238, loss_att=43.642, acc=0.963, loss=43.642, backward_time=0.339, grad_norm=91.428, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=8.161e-04, train_time=3.132 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:18:53,452 (trainer:732) INFO: 46epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.234, loss_att=42.620, acc=0.961, loss=42.620, backward_time=0.334, grad_norm=90.736, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.157e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:25:43,873 (trainer:732) INFO: 46epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.234, loss_att=43.083, acc=0.961, loss=43.083, backward_time=0.335, grad_norm=81.730, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.152e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:32:33,919 (trainer:732) INFO: 46epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.234, loss_att=43.495, acc=0.961, loss=43.495, backward_time=0.335, grad_norm=83.425, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.148e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:39:22,935 (trainer:732) INFO: 46epoch:train:4771-5300batch: iter_time=9.475e-04, forward_time=0.233, loss_att=42.339, acc=0.962, loss=42.339, backward_time=0.334, grad_norm=86.126, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.143e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:46:13,573 (trainer:732) INFO: 46epoch:train:5301-5830batch: iter_time=9.399e-04, forward_time=0.235, loss_att=43.596, acc=0.961, loss=43.596, backward_time=0.335, grad_norm=86.935, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.139e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:53:03,367 (trainer:732) INFO: 46epoch:train:5831-6360batch: iter_time=8.276e-04, forward_time=0.233, loss_att=44.024, acc=0.961, loss=44.024, backward_time=0.334, grad_norm=94.092, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.134e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 04:59:54,186 (trainer:732) INFO: 46epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.233, loss_att=43.940, acc=0.962, loss=43.940, backward_time=0.333, grad_norm=85.698, clip=100.000, loss_scale=1.000, optim_step_time=0.205, optim0_lr0=8.130e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:06:43,928 (trainer:732) INFO: 46epoch:train:6891-7420batch: iter_time=9.784e-04, forward_time=0.233, loss_att=43.909, acc=0.962, loss=43.909, backward_time=0.336, grad_norm=89.458, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=8.125e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:13:34,747 (trainer:732) INFO: 46epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.235, loss_att=44.219, acc=0.961, loss=44.219, backward_time=0.335, grad_norm=87.940, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.121e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:20:22,620 (trainer:732) INFO: 46epoch:train:7951-8480batch: iter_time=9.455e-04, forward_time=0.233, loss_att=44.110, acc=0.960, loss=44.110, backward_time=0.333, grad_norm=88.669, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.116e-04, train_time=3.076 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:27:10,505 (trainer:732) INFO: 46epoch:train:8481-9010batch: iter_time=9.552e-04, forward_time=0.233, loss_att=43.613, acc=0.961, loss=43.613, backward_time=0.334, grad_norm=85.650, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.112e-04, train_time=3.079 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:34:01,113 (trainer:732) INFO: 46epoch:train:9011-9540batch: iter_time=9.133e-04, forward_time=0.235, loss_att=43.627, acc=0.961, loss=43.627, backward_time=0.336, grad_norm=85.343, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=8.108e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:40:53,240 (trainer:732) INFO: 46epoch:train:9541-10070batch: iter_time=9.827e-04, forward_time=0.236, loss_att=43.519, acc=0.961, loss=43.519, backward_time=0.336, grad_norm=95.547, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.103e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:47:44,560 (trainer:732) INFO: 46epoch:train:10071-10600batch: iter_time=8.686e-04, forward_time=0.235, loss_att=44.166, acc=0.961, loss=44.166, backward_time=0.336, grad_norm=94.163, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.099e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:57:04,807 (trainer:338) INFO: 46epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=43.480, acc=0.961, loss=43.480, backward_time=0.335, grad_norm=88.873, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.141e-04, train_time=3.182, time=2 hours, 20 minutes and 56.52 seconds, total_count=488290, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.223, acc=0.950, cer=0.062, wer=0.185, loss=58.223, time=3 minutes and 59.01 seconds, total_count=552, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 0.47 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:57:15,629 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:57:15,643 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/35epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 05:57:15,643 (trainer:272) INFO: 47/60epoch started. Estimated time to finish: 1 day, 9 hours and 25 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:07:47,617 (trainer:732) INFO: 47epoch:train:1-530batch: iter_time=0.004, forward_time=0.239, loss_att=42.057, acc=0.963, loss=42.057, backward_time=0.338, grad_norm=91.358, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.094e-04, train_time=4.775 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:14:42,284 (trainer:732) INFO: 47epoch:train:531-1060batch: iter_time=9.544e-04, forward_time=0.238, loss_att=43.225, acc=0.962, loss=43.225, backward_time=0.339, grad_norm=88.393, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.090e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:21:32,167 (trainer:732) INFO: 47epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.237, loss_att=42.544, acc=0.961, loss=42.544, backward_time=0.335, grad_norm=85.881, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=8.085e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:28:22,748 (trainer:732) INFO: 47epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.234, loss_att=42.914, acc=0.963, loss=42.914, backward_time=0.337, grad_norm=85.772, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=8.081e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:35:14,105 (trainer:732) INFO: 47epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.235, loss_att=43.409, acc=0.962, loss=43.409, backward_time=0.336, grad_norm=86.292, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.077e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:42:06,753 (trainer:732) INFO: 47epoch:train:2651-3180batch: iter_time=8.378e-04, forward_time=0.236, loss_att=42.820, acc=0.962, loss=42.820, backward_time=0.337, grad_norm=85.561, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.072e-04, train_time=3.112 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:48:55,486 (trainer:732) INFO: 47epoch:train:3181-3710batch: iter_time=9.422e-04, forward_time=0.233, loss_att=43.266, acc=0.961, loss=43.266, backward_time=0.334, grad_norm=86.828, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.068e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 06:55:48,092 (trainer:732) INFO: 47epoch:train:3711-4240batch: iter_time=9.293e-04, forward_time=0.235, loss_att=43.341, acc=0.963, loss=43.341, backward_time=0.337, grad_norm=97.805, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=8.064e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:02:37,182 (trainer:732) INFO: 47epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.234, loss_att=42.743, acc=0.962, loss=42.743, backward_time=0.334, grad_norm=89.605, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.059e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:09:26,259 (trainer:732) INFO: 47epoch:train:4771-5300batch: iter_time=9.364e-04, forward_time=0.233, loss_att=43.573, acc=0.961, loss=43.573, backward_time=0.334, grad_norm=93.410, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=8.055e-04, train_time=3.085 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:16:13,620 (trainer:732) INFO: 47epoch:train:5301-5830batch: iter_time=9.250e-04, forward_time=0.232, loss_att=44.030, acc=0.961, loss=44.030, backward_time=0.333, grad_norm=90.562, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.051e-04, train_time=3.075 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:23:03,584 (trainer:732) INFO: 47epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.233, loss_att=42.906, acc=0.962, loss=42.906, backward_time=0.336, grad_norm=84.555, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.046e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:29:53,346 (trainer:732) INFO: 47epoch:train:6361-6890batch: iter_time=8.604e-04, forward_time=0.233, loss_att=43.197, acc=0.962, loss=43.197, backward_time=0.335, grad_norm=88.605, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.042e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:36:45,398 (trainer:732) INFO: 47epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.234, loss_att=42.227, acc=0.962, loss=42.227, backward_time=0.335, grad_norm=88.682, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=8.038e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:43:33,472 (trainer:732) INFO: 47epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.232, loss_att=43.082, acc=0.962, loss=43.082, backward_time=0.333, grad_norm=84.038, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.033e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:50:24,128 (trainer:732) INFO: 47epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.235, loss_att=43.099, acc=0.962, loss=43.099, backward_time=0.336, grad_norm=91.294, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=8.029e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 07:57:12,640 (trainer:732) INFO: 47epoch:train:8481-9010batch: iter_time=8.490e-04, forward_time=0.233, loss_att=42.469, acc=0.962, loss=42.469, backward_time=0.335, grad_norm=97.821, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=8.025e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:04:04,717 (trainer:732) INFO: 47epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=43.518, acc=0.961, loss=43.518, backward_time=0.336, grad_norm=85.709, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=8.021e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:10:55,413 (trainer:732) INFO: 47epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.235, loss_att=43.978, acc=0.961, loss=43.978, backward_time=0.335, grad_norm=84.941, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=8.016e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:17:46,782 (trainer:732) INFO: 47epoch:train:10071-10600batch: iter_time=8.723e-04, forward_time=0.235, loss_att=42.491, acc=0.962, loss=42.491, backward_time=0.337, grad_norm=88.904, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=8.012e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:27:05,686 (trainer:338) INFO: 47epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=43.053, acc=0.962, loss=43.053, backward_time=0.336, grad_norm=88.836, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=8.053e-04, train_time=3.181, time=2 hours, 20 minutes and 52.04 seconds, total_count=498905, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.768, acc=0.950, cer=0.064, wer=0.186, loss=58.768, time=3 minutes and 57.55 seconds, total_count=564, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 0.45 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:27:15,563 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:27:15,586 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/38epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:27:15,586 (trainer:272) INFO: 48/60epoch started. Estimated time to finish: 1 day, 7 hours and 4 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:37:42,838 (trainer:732) INFO: 48epoch:train:1-530batch: iter_time=0.004, forward_time=0.239, loss_att=41.122, acc=0.963, loss=41.122, backward_time=0.337, grad_norm=89.829, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=8.008e-04, train_time=4.740 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:44:37,256 (trainer:732) INFO: 48epoch:train:531-1060batch: iter_time=0.001, forward_time=0.238, loss_att=42.225, acc=0.963, loss=42.225, backward_time=0.336, grad_norm=85.680, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=8.003e-04, train_time=3.125 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:51:29,787 (trainer:732) INFO: 48epoch:train:1061-1590batch: iter_time=8.746e-04, forward_time=0.238, loss_att=42.951, acc=0.962, loss=42.951, backward_time=0.336, grad_norm=90.514, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.999e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 08:58:23,718 (trainer:732) INFO: 48epoch:train:1591-2120batch: iter_time=7.897e-04, forward_time=0.238, loss_att=41.933, acc=0.963, loss=41.933, backward_time=0.338, grad_norm=89.971, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.995e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:05:15,071 (trainer:732) INFO: 48epoch:train:2121-2650batch: iter_time=9.205e-04, forward_time=0.236, loss_att=42.609, acc=0.962, loss=42.609, backward_time=0.336, grad_norm=87.740, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.991e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:12:06,404 (trainer:732) INFO: 48epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.236, loss_att=43.063, acc=0.962, loss=43.063, backward_time=0.336, grad_norm=92.187, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.987e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:18:57,139 (trainer:732) INFO: 48epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.235, loss_att=42.785, acc=0.962, loss=42.785, backward_time=0.335, grad_norm=91.411, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.982e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:25:49,079 (trainer:732) INFO: 48epoch:train:3711-4240batch: iter_time=7.923e-04, forward_time=0.235, loss_att=42.018, acc=0.962, loss=42.018, backward_time=0.335, grad_norm=87.878, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.978e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:32:39,139 (trainer:732) INFO: 48epoch:train:4241-4770batch: iter_time=9.700e-04, forward_time=0.234, loss_att=42.876, acc=0.962, loss=42.876, backward_time=0.335, grad_norm=85.491, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.974e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:39:30,015 (trainer:732) INFO: 48epoch:train:4771-5300batch: iter_time=8.927e-04, forward_time=0.233, loss_att=42.331, acc=0.963, loss=42.331, backward_time=0.336, grad_norm=89.921, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.970e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:46:21,999 (trainer:732) INFO: 48epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.234, loss_att=42.659, acc=0.962, loss=42.659, backward_time=0.335, grad_norm=88.174, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.966e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 09:53:15,392 (trainer:732) INFO: 48epoch:train:5831-6360batch: iter_time=9.012e-04, forward_time=0.236, loss_att=42.378, acc=0.962, loss=42.378, backward_time=0.336, grad_norm=90.814, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=7.961e-04, train_time=3.118 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:00:06,482 (trainer:732) INFO: 48epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.237, loss_att=43.335, acc=0.962, loss=43.335, backward_time=0.337, grad_norm=88.553, clip=100.000, loss_scale=1.000, optim_step_time=0.183, optim0_lr0=7.957e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:06:58,671 (trainer:732) INFO: 48epoch:train:6891-7420batch: iter_time=9.301e-04, forward_time=0.237, loss_att=41.877, acc=0.962, loss=41.877, backward_time=0.336, grad_norm=91.798, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.953e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:13:50,352 (trainer:732) INFO: 48epoch:train:7421-7950batch: iter_time=9.090e-04, forward_time=0.234, loss_att=43.650, acc=0.962, loss=43.650, backward_time=0.337, grad_norm=83.786, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.949e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:20:43,637 (trainer:732) INFO: 48epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.236, loss_att=44.165, acc=0.961, loss=44.165, backward_time=0.336, grad_norm=86.923, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.945e-04, train_time=3.118 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:27:36,155 (trainer:732) INFO: 48epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.235, loss_att=43.332, acc=0.962, loss=43.332, backward_time=0.336, grad_norm=91.422, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.941e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:34:24,239 (trainer:732) INFO: 48epoch:train:9011-9540batch: iter_time=9.508e-04, forward_time=0.232, loss_att=42.447, acc=0.962, loss=42.447, backward_time=0.333, grad_norm=90.831, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.936e-04, train_time=3.078 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:41:18,786 (trainer:732) INFO: 48epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.237, loss_att=43.218, acc=0.963, loss=43.218, backward_time=0.335, grad_norm=87.748, clip=100.000, loss_scale=1.000, optim_step_time=0.211, optim0_lr0=7.932e-04, train_time=3.129 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:48:11,247 (trainer:732) INFO: 48epoch:train:10071-10600batch: iter_time=8.547e-04, forward_time=0.237, loss_att=43.991, acc=0.961, loss=43.991, backward_time=0.336, grad_norm=87.300, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.928e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:57:40,415 (trainer:338) INFO: 48epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=42.733, acc=0.962, loss=42.733, backward_time=0.336, grad_norm=88.883, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.968e-04, train_time=3.190, time=2 hours, 21 minutes and 15.55 seconds, total_count=509520, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.304, acc=0.950, cer=0.063, wer=0.185, loss=58.304, time=3 minutes and 57.55 seconds, total_count=576, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 11.72 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:57:49,943 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:57:49,957 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/40epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 10:57:49,958 (trainer:272) INFO: 49/60epoch started. Estimated time to finish: 1 day, 4 hours and 43 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:08:24,560 (trainer:732) INFO: 49epoch:train:1-530batch: iter_time=0.003, forward_time=0.240, loss_att=41.565, acc=0.963, loss=41.565, backward_time=0.339, grad_norm=90.949, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.924e-04, train_time=4.797 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:15:16,839 (trainer:732) INFO: 49epoch:train:531-1060batch: iter_time=0.001, forward_time=0.236, loss_att=41.497, acc=0.963, loss=41.497, backward_time=0.336, grad_norm=95.825, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.920e-04, train_time=3.109 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:22:07,360 (trainer:732) INFO: 49epoch:train:1061-1590batch: iter_time=9.517e-04, forward_time=0.235, loss_att=40.564, acc=0.964, loss=40.564, backward_time=0.335, grad_norm=90.310, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.916e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:28:59,629 (trainer:732) INFO: 49epoch:train:1591-2120batch: iter_time=8.611e-04, forward_time=0.236, loss_att=42.195, acc=0.963, loss=42.195, backward_time=0.337, grad_norm=88.152, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.912e-04, train_time=3.109 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:35:51,278 (trainer:732) INFO: 49epoch:train:2121-2650batch: iter_time=8.787e-04, forward_time=0.236, loss_att=41.832, acc=0.963, loss=41.832, backward_time=0.337, grad_norm=90.214, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.908e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:42:42,922 (trainer:732) INFO: 49epoch:train:2651-3180batch: iter_time=9.007e-04, forward_time=0.236, loss_att=41.321, acc=0.963, loss=41.321, backward_time=0.335, grad_norm=86.419, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.903e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:49:33,659 (trainer:732) INFO: 49epoch:train:3181-3710batch: iter_time=9.274e-04, forward_time=0.235, loss_att=42.366, acc=0.962, loss=42.366, backward_time=0.335, grad_norm=85.321, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.899e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 11:56:25,546 (trainer:732) INFO: 49epoch:train:3711-4240batch: iter_time=9.560e-04, forward_time=0.235, loss_att=42.563, acc=0.963, loss=42.563, backward_time=0.336, grad_norm=83.328, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.895e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:03:13,656 (trainer:732) INFO: 49epoch:train:4241-4770batch: iter_time=9.160e-04, forward_time=0.233, loss_att=42.366, acc=0.962, loss=42.366, backward_time=0.334, grad_norm=84.124, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.891e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:10:02,884 (trainer:732) INFO: 49epoch:train:4771-5300batch: iter_time=9.984e-04, forward_time=0.233, loss_att=42.486, acc=0.963, loss=42.486, backward_time=0.336, grad_norm=96.730, clip=100.000, loss_scale=1.000, optim_step_time=0.179, optim0_lr0=7.887e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:16:52,819 (trainer:732) INFO: 49epoch:train:5301-5830batch: iter_time=9.771e-04, forward_time=0.235, loss_att=42.418, acc=0.963, loss=42.418, backward_time=0.335, grad_norm=90.674, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.883e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:23:43,887 (trainer:732) INFO: 49epoch:train:5831-6360batch: iter_time=9.306e-04, forward_time=0.234, loss_att=41.595, acc=0.963, loss=41.595, backward_time=0.336, grad_norm=84.024, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.879e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:30:34,088 (trainer:732) INFO: 49epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.235, loss_att=42.853, acc=0.962, loss=42.853, backward_time=0.336, grad_norm=85.196, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.875e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:37:23,479 (trainer:732) INFO: 49epoch:train:6891-7420batch: iter_time=9.198e-04, forward_time=0.234, loss_att=43.268, acc=0.962, loss=43.268, backward_time=0.335, grad_norm=88.265, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.871e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:44:14,197 (trainer:732) INFO: 49epoch:train:7421-7950batch: iter_time=8.711e-04, forward_time=0.236, loss_att=42.947, acc=0.962, loss=42.947, backward_time=0.334, grad_norm=92.055, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.867e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:51:05,851 (trainer:732) INFO: 49epoch:train:7951-8480batch: iter_time=8.810e-04, forward_time=0.236, loss_att=44.083, acc=0.961, loss=44.083, backward_time=0.336, grad_norm=96.998, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.863e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 12:57:57,401 (trainer:732) INFO: 49epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.236, loss_att=42.611, acc=0.962, loss=42.611, backward_time=0.335, grad_norm=88.523, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.859e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:04:47,226 (trainer:732) INFO: 49epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=41.667, acc=0.962, loss=41.667, backward_time=0.335, grad_norm=91.016, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.855e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:11:37,110 (trainer:732) INFO: 49epoch:train:9541-10070batch: iter_time=9.288e-04, forward_time=0.234, loss_att=42.796, acc=0.962, loss=42.796, backward_time=0.335, grad_norm=86.822, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.851e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:18:26,579 (trainer:732) INFO: 49epoch:train:10071-10600batch: iter_time=9.386e-04, forward_time=0.232, loss_att=42.270, acc=0.962, loss=42.270, backward_time=0.334, grad_norm=89.161, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.847e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:27:03,116 (trainer:338) INFO: 49epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=42.253, acc=0.962, loss=42.253, backward_time=0.335, grad_norm=89.177, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.885e-04, train_time=3.183, time=2 hours, 20 minutes and 55.8 seconds, total_count=520135, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.796, acc=0.951, cer=0.064, wer=0.183, loss=58.796, time=3 minutes and 57.06 seconds, total_count=588, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 20.29 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:27:09,183 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:27:09,198 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/37epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:27:09,198 (trainer:272) INFO: 50/60epoch started. Estimated time to finish: 1 day, 2 hours and 21 minutes +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:37:29,179 (trainer:732) INFO: 50epoch:train:1-530batch: iter_time=0.002, forward_time=0.241, loss_att=40.804, acc=0.964, loss=40.804, backward_time=0.339, grad_norm=86.102, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.843e-04, train_time=4.685 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:44:24,952 (trainer:732) INFO: 50epoch:train:531-1060batch: iter_time=0.001, forward_time=0.240, loss_att=40.106, acc=0.964, loss=40.106, backward_time=0.338, grad_norm=84.416, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.839e-04, train_time=3.135 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:51:18,476 (trainer:732) INFO: 50epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.238, loss_att=41.421, acc=0.963, loss=41.421, backward_time=0.337, grad_norm=84.923, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.835e-04, train_time=3.121 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 13:58:11,502 (trainer:732) INFO: 50epoch:train:1591-2120batch: iter_time=9.701e-04, forward_time=0.237, loss_att=41.101, acc=0.963, loss=41.101, backward_time=0.337, grad_norm=89.017, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.831e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:05:03,958 (trainer:732) INFO: 50epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=41.034, acc=0.963, loss=41.034, backward_time=0.337, grad_norm=86.545, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.827e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:11:56,923 (trainer:732) INFO: 50epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.237, loss_att=42.362, acc=0.962, loss=42.362, backward_time=0.336, grad_norm=88.614, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.823e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:18:47,423 (trainer:732) INFO: 50epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.235, loss_att=42.220, acc=0.962, loss=42.220, backward_time=0.334, grad_norm=87.404, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.819e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:25:39,196 (trainer:732) INFO: 50epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.236, loss_att=41.437, acc=0.963, loss=41.437, backward_time=0.337, grad_norm=88.966, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.815e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:32:31,015 (trainer:732) INFO: 50epoch:train:4241-4770batch: iter_time=8.677e-04, forward_time=0.236, loss_att=42.571, acc=0.963, loss=42.571, backward_time=0.337, grad_norm=87.164, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.811e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:39:21,337 (trainer:732) INFO: 50epoch:train:4771-5300batch: iter_time=8.848e-04, forward_time=0.235, loss_att=41.596, acc=0.963, loss=41.596, backward_time=0.335, grad_norm=91.814, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.807e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:46:09,993 (trainer:732) INFO: 50epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.233, loss_att=42.806, acc=0.962, loss=42.806, backward_time=0.334, grad_norm=81.136, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.803e-04, train_time=3.084 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:53:01,180 (trainer:732) INFO: 50epoch:train:5831-6360batch: iter_time=9.733e-04, forward_time=0.234, loss_att=41.102, acc=0.963, loss=41.102, backward_time=0.337, grad_norm=81.646, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.799e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 14:59:51,365 (trainer:732) INFO: 50epoch:train:6361-6890batch: iter_time=9.144e-04, forward_time=0.235, loss_att=41.812, acc=0.963, loss=41.812, backward_time=0.335, grad_norm=92.203, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.795e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:06:46,057 (trainer:732) INFO: 50epoch:train:6891-7420batch: iter_time=8.829e-04, forward_time=0.238, loss_att=41.610, acc=0.964, loss=41.610, backward_time=0.338, grad_norm=94.538, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.791e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:13:36,886 (trainer:732) INFO: 50epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.235, loss_att=42.850, acc=0.962, loss=42.850, backward_time=0.336, grad_norm=91.026, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.787e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:20:28,563 (trainer:732) INFO: 50epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.235, loss_att=42.252, acc=0.963, loss=42.252, backward_time=0.337, grad_norm=97.705, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.783e-04, train_time=3.105 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<18454> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:27:19,084 (trainer:732) INFO: 50epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.234, loss_att=43.374, acc=0.962, loss=43.374, backward_time=0.334, grad_norm=83.305, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.780e-04, train_time=3.099 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:34:07,619 (trainer:732) INFO: 50epoch:train:9011-9540batch: iter_time=9.761e-04, forward_time=0.234, loss_att=42.719, acc=0.962, loss=42.719, backward_time=0.334, grad_norm=96.422, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.776e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:40:57,833 (trainer:732) INFO: 50epoch:train:9541-10070batch: iter_time=8.890e-04, forward_time=0.234, loss_att=42.223, acc=0.963, loss=42.223, backward_time=0.336, grad_norm=90.932, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.772e-04, train_time=3.097 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:47:48,606 (trainer:732) INFO: 50epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.234, loss_att=42.092, acc=0.962, loss=42.092, backward_time=0.334, grad_norm=88.182, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.768e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:56:58,171 (trainer:338) INFO: 50epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=41.876, acc=0.963, loss=41.876, backward_time=0.336, grad_norm=88.624, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.805e-04, train_time=3.184, time=2 hours, 20 minutes and 59.28 seconds, total_count=530750, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.415, acc=0.950, cer=0.064, wer=0.184, loss=58.415, time=3 minutes and 50.71 seconds, total_count=600, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 58.98 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:57:08,692 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:57:08,707 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/42epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 15:57:08,708 (trainer:272) INFO: 51/60epoch started. Estimated time to finish: 23 hours, 59 minutes and 13.12 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:07:40,813 (trainer:732) INFO: 51epoch:train:1-530batch: iter_time=0.003, forward_time=0.242, loss_att=40.761, acc=0.963, loss=40.761, backward_time=0.338, grad_norm=86.600, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=7.764e-04, train_time=4.776 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:14:35,272 (trainer:732) INFO: 51epoch:train:531-1060batch: iter_time=0.001, forward_time=0.239, loss_att=40.760, acc=0.964, loss=40.760, backward_time=0.338, grad_norm=86.976, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.760e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:21:26,659 (trainer:732) INFO: 51epoch:train:1061-1590batch: iter_time=8.914e-04, forward_time=0.236, loss_att=40.925, acc=0.964, loss=40.925, backward_time=0.337, grad_norm=87.240, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.756e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:28:20,876 (trainer:732) INFO: 51epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.237, loss_att=41.606, acc=0.963, loss=41.606, backward_time=0.338, grad_norm=83.587, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.752e-04, train_time=3.123 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:35:13,823 (trainer:732) INFO: 51epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=40.371, acc=0.964, loss=40.371, backward_time=0.336, grad_norm=85.509, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.748e-04, train_time=3.117 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:42:04,162 (trainer:732) INFO: 51epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.235, loss_att=41.492, acc=0.963, loss=41.492, backward_time=0.334, grad_norm=82.089, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.745e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:48:56,216 (trainer:732) INFO: 51epoch:train:3181-3710batch: iter_time=9.143e-04, forward_time=0.235, loss_att=41.361, acc=0.963, loss=41.361, backward_time=0.337, grad_norm=91.252, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.741e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 16:55:46,728 (trainer:732) INFO: 51epoch:train:3711-4240batch: iter_time=9.566e-04, forward_time=0.235, loss_att=41.052, acc=0.963, loss=41.052, backward_time=0.334, grad_norm=90.911, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.737e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:02:39,518 (trainer:732) INFO: 51epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.237, loss_att=40.798, acc=0.964, loss=40.798, backward_time=0.337, grad_norm=84.786, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.733e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:09:33,689 (trainer:732) INFO: 51epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.237, loss_att=42.398, acc=0.963, loss=42.398, backward_time=0.337, grad_norm=105.766, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.729e-04, train_time=3.123 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:16:21,771 (trainer:732) INFO: 51epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.233, loss_att=41.599, acc=0.963, loss=41.599, backward_time=0.334, grad_norm=90.308, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.725e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:23:10,416 (trainer:732) INFO: 51epoch:train:5831-6360batch: iter_time=8.787e-04, forward_time=0.233, loss_att=41.782, acc=0.963, loss=41.782, backward_time=0.334, grad_norm=93.931, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.722e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:29:58,899 (trainer:732) INFO: 51epoch:train:6361-6890batch: iter_time=9.683e-04, forward_time=0.232, loss_att=41.764, acc=0.963, loss=41.764, backward_time=0.335, grad_norm=96.358, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.718e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:36:47,703 (trainer:732) INFO: 51epoch:train:6891-7420batch: iter_time=8.979e-04, forward_time=0.232, loss_att=41.102, acc=0.963, loss=41.102, backward_time=0.334, grad_norm=89.525, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.714e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:43:35,609 (trainer:732) INFO: 51epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.232, loss_att=41.208, acc=0.962, loss=41.208, backward_time=0.335, grad_norm=93.867, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.710e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:50:23,260 (trainer:732) INFO: 51epoch:train:7951-8480batch: iter_time=9.993e-04, forward_time=0.231, loss_att=42.184, acc=0.962, loss=42.184, backward_time=0.334, grad_norm=97.538, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.706e-04, train_time=3.074 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 17:57:11,305 (trainer:732) INFO: 51epoch:train:8481-9010batch: iter_time=8.008e-04, forward_time=0.232, loss_att=42.351, acc=0.963, loss=42.351, backward_time=0.333, grad_norm=92.753, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.703e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:04:02,448 (trainer:732) INFO: 51epoch:train:9011-9540batch: iter_time=9.681e-04, forward_time=0.234, loss_att=42.206, acc=0.964, loss=42.206, backward_time=0.336, grad_norm=92.695, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.699e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:10:53,064 (trainer:732) INFO: 51epoch:train:9541-10070batch: iter_time=8.794e-04, forward_time=0.235, loss_att=42.735, acc=0.963, loss=42.735, backward_time=0.337, grad_norm=95.919, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.695e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:17:40,259 (trainer:732) INFO: 51epoch:train:10071-10600batch: iter_time=8.835e-04, forward_time=0.232, loss_att=41.930, acc=0.963, loss=41.930, backward_time=0.334, grad_norm=90.325, clip=100.000, loss_scale=1.000, optim_step_time=0.181, optim0_lr0=7.691e-04, train_time=3.070 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:26:54,729 (trainer:338) INFO: 51epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=41.513, acc=0.963, loss=41.513, backward_time=0.336, grad_norm=90.918, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.727e-04, train_time=3.181, time=2 hours, 20 minutes and 51.75 seconds, total_count=541365, gpu_max_cached_mem_GB=30.221, [valid] loss_att=59.311, acc=0.950, cer=0.062, wer=0.185, loss=59.311, time=3 minutes and 51.43 seconds, total_count=612, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 2.84 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:27:04,787 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:27:04,812 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/39epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:27:04,823 (trainer:272) INFO: 52/60epoch started. Estimated time to finish: 21 hours, 36 minutes and 33.32 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:37:33,033 (trainer:732) INFO: 52epoch:train:1-530batch: iter_time=0.003, forward_time=0.238, loss_att=39.575, acc=0.964, loss=39.575, backward_time=0.337, grad_norm=96.594, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.687e-04, train_time=4.748 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:44:24,730 (trainer:732) INFO: 52epoch:train:531-1060batch: iter_time=9.031e-04, forward_time=0.236, loss_att=41.235, acc=0.963, loss=41.235, backward_time=0.336, grad_norm=88.594, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.684e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:51:14,148 (trainer:732) INFO: 52epoch:train:1061-1590batch: iter_time=8.284e-04, forward_time=0.235, loss_att=41.236, acc=0.963, loss=41.236, backward_time=0.336, grad_norm=85.267, clip=100.000, loss_scale=1.000, optim_step_time=0.183, optim0_lr0=7.680e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 18:58:04,979 (trainer:732) INFO: 52epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.236, loss_att=41.019, acc=0.964, loss=41.019, backward_time=0.335, grad_norm=89.109, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.676e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:04:56,635 (trainer:732) INFO: 52epoch:train:2121-2650batch: iter_time=9.570e-04, forward_time=0.235, loss_att=41.282, acc=0.963, loss=41.282, backward_time=0.336, grad_norm=93.675, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.672e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:11:48,573 (trainer:732) INFO: 52epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.234, loss_att=41.636, acc=0.964, loss=41.636, backward_time=0.336, grad_norm=85.382, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.669e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:18:37,617 (trainer:732) INFO: 52epoch:train:3181-3710batch: iter_time=7.454e-04, forward_time=0.235, loss_att=41.396, acc=0.963, loss=41.396, backward_time=0.335, grad_norm=87.327, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.665e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:25:27,848 (trainer:732) INFO: 52epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.234, loss_att=42.036, acc=0.963, loss=42.036, backward_time=0.335, grad_norm=91.760, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.661e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:32:18,521 (trainer:732) INFO: 52epoch:train:4241-4770batch: iter_time=9.340e-04, forward_time=0.234, loss_att=41.928, acc=0.964, loss=41.928, backward_time=0.336, grad_norm=88.537, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.658e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:39:09,837 (trainer:732) INFO: 52epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.233, loss_att=40.990, acc=0.964, loss=40.990, backward_time=0.336, grad_norm=86.570, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.654e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:46:00,199 (trainer:732) INFO: 52epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.235, loss_att=41.224, acc=0.963, loss=41.224, backward_time=0.334, grad_norm=84.905, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.650e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:52:51,698 (trainer:732) INFO: 52epoch:train:5831-6360batch: iter_time=8.774e-04, forward_time=0.236, loss_att=41.954, acc=0.963, loss=41.954, backward_time=0.337, grad_norm=86.100, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.646e-04, train_time=3.104 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 19:59:41,748 (trainer:732) INFO: 52epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.235, loss_att=41.681, acc=0.963, loss=41.681, backward_time=0.336, grad_norm=91.116, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=7.643e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:06:33,259 (trainer:732) INFO: 52epoch:train:6891-7420batch: iter_time=9.804e-04, forward_time=0.235, loss_att=40.794, acc=0.963, loss=40.794, backward_time=0.334, grad_norm=90.581, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.639e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:13:22,525 (trainer:732) INFO: 52epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.233, loss_att=41.634, acc=0.963, loss=41.634, backward_time=0.334, grad_norm=95.214, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.635e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:20:10,689 (trainer:732) INFO: 52epoch:train:7951-8480batch: iter_time=7.298e-04, forward_time=0.232, loss_att=40.371, acc=0.964, loss=40.371, backward_time=0.334, grad_norm=86.807, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.632e-04, train_time=3.078 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:26:59,958 (trainer:732) INFO: 52epoch:train:8481-9010batch: iter_time=6.978e-04, forward_time=0.235, loss_att=41.739, acc=0.963, loss=41.739, backward_time=0.335, grad_norm=94.027, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=7.628e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:33:47,744 (trainer:732) INFO: 52epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=41.601, acc=0.962, loss=41.601, backward_time=0.333, grad_norm=88.309, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.624e-04, train_time=3.074 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:40:37,303 (trainer:732) INFO: 52epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.233, loss_att=41.750, acc=0.964, loss=41.750, backward_time=0.335, grad_norm=84.639, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.621e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:47:26,842 (trainer:732) INFO: 52epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.234, loss_att=41.065, acc=0.963, loss=41.065, backward_time=0.334, grad_norm=90.274, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=7.617e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:56:01,415 (trainer:338) INFO: 52epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=41.308, acc=0.963, loss=41.308, backward_time=0.335, grad_norm=89.224, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.652e-04, train_time=3.177, time=2 hours, 20 minutes and 41.82 seconds, total_count=551980, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.179, acc=0.950, cer=0.064, wer=0.184, loss=58.179, time=3 minutes and 57.35 seconds, total_count=624, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 17.42 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:56:07,674 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:56:07,689 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/44epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 20:56:07,690 (trainer:272) INFO: 53/60epoch started. Estimated time to finish: 19 hours, 13 minutes and 24.01 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:06:27,399 (trainer:732) INFO: 53epoch:train:1-530batch: iter_time=0.003, forward_time=0.240, loss_att=40.683, acc=0.965, loss=40.683, backward_time=0.340, grad_norm=90.658, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.613e-04, train_time=4.684 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:13:20,681 (trainer:732) INFO: 53epoch:train:531-1060batch: iter_time=0.001, forward_time=0.238, loss_att=40.579, acc=0.964, loss=40.579, backward_time=0.338, grad_norm=90.166, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.610e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:20:15,174 (trainer:732) INFO: 53epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.239, loss_att=40.008, acc=0.964, loss=40.008, backward_time=0.337, grad_norm=91.308, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=7.606e-04, train_time=3.129 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:27:08,335 (trainer:732) INFO: 53epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.236, loss_att=39.662, acc=0.965, loss=39.662, backward_time=0.337, grad_norm=85.400, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.602e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:34:00,716 (trainer:732) INFO: 53epoch:train:2121-2650batch: iter_time=9.135e-04, forward_time=0.237, loss_att=39.454, acc=0.965, loss=39.454, backward_time=0.337, grad_norm=84.998, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.599e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:40:53,782 (trainer:732) INFO: 53epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.235, loss_att=41.299, acc=0.963, loss=41.299, backward_time=0.337, grad_norm=83.464, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.595e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:47:43,389 (trainer:732) INFO: 53epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.235, loss_att=41.386, acc=0.963, loss=41.386, backward_time=0.335, grad_norm=84.634, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.591e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 21:54:35,239 (trainer:732) INFO: 53epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.236, loss_att=40.323, acc=0.964, loss=40.323, backward_time=0.337, grad_norm=83.351, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.588e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:01:28,802 (trainer:732) INFO: 53epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.238, loss_att=40.973, acc=0.964, loss=40.973, backward_time=0.336, grad_norm=85.377, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.584e-04, train_time=3.122 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:08:22,179 (trainer:732) INFO: 53epoch:train:4771-5300batch: iter_time=9.850e-04, forward_time=0.236, loss_att=40.576, acc=0.964, loss=40.576, backward_time=0.336, grad_norm=88.901, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.581e-04, train_time=3.117 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:15:13,537 (trainer:732) INFO: 53epoch:train:5301-5830batch: iter_time=9.149e-04, forward_time=0.234, loss_att=42.054, acc=0.964, loss=42.054, backward_time=0.336, grad_norm=95.100, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.577e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:22:03,923 (trainer:732) INFO: 53epoch:train:5831-6360batch: iter_time=8.905e-04, forward_time=0.235, loss_att=40.453, acc=0.964, loss=40.453, backward_time=0.335, grad_norm=85.524, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.573e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:28:57,071 (trainer:732) INFO: 53epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.238, loss_att=40.036, acc=0.964, loss=40.036, backward_time=0.336, grad_norm=97.951, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.570e-04, train_time=3.118 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:35:48,366 (trainer:732) INFO: 53epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.235, loss_att=41.245, acc=0.963, loss=41.245, backward_time=0.336, grad_norm=90.403, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.566e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:42:39,939 (trainer:732) INFO: 53epoch:train:7421-7950batch: iter_time=8.928e-04, forward_time=0.235, loss_att=41.780, acc=0.963, loss=41.780, backward_time=0.335, grad_norm=88.245, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.563e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:49:34,080 (trainer:732) INFO: 53epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.236, loss_att=41.709, acc=0.963, loss=41.709, backward_time=0.337, grad_norm=89.811, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.559e-04, train_time=3.123 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 22:56:25,918 (trainer:732) INFO: 53epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.235, loss_att=41.568, acc=0.964, loss=41.568, backward_time=0.337, grad_norm=94.428, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.555e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:03:18,013 (trainer:732) INFO: 53epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.236, loss_att=41.898, acc=0.963, loss=41.898, backward_time=0.335, grad_norm=92.457, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.552e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:10:04,879 (trainer:732) INFO: 53epoch:train:9541-10070batch: iter_time=9.171e-04, forward_time=0.232, loss_att=40.904, acc=0.963, loss=40.904, backward_time=0.333, grad_norm=85.349, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.548e-04, train_time=3.072 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:16:53,530 (trainer:732) INFO: 53epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.235, loss_att=41.126, acc=0.962, loss=41.126, backward_time=0.332, grad_norm=85.584, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.545e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:26:24,706 (trainer:338) INFO: 53epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=40.883, acc=0.964, loss=40.883, backward_time=0.336, grad_norm=88.775, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.579e-04, train_time=3.186, time=2 hours, 21 minutes and 7.31 seconds, total_count=562595, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.454, acc=0.951, cer=0.061, wer=0.182, loss=57.454, time=4 minutes and 0.39 seconds, total_count=636, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 9.31 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:26:35,473 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:26:35,488 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/45epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:26:35,489 (trainer:272) INFO: 54/60epoch started. Estimated time to finish: 16 hours, 50 minutes and 12.2 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:37:01,683 (trainer:732) INFO: 54epoch:train:1-530batch: iter_time=0.004, forward_time=0.240, loss_att=39.464, acc=0.965, loss=39.464, backward_time=0.338, grad_norm=83.956, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.541e-04, train_time=4.732 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:43:58,442 (trainer:732) INFO: 54epoch:train:531-1060batch: iter_time=0.001, forward_time=0.241, loss_att=39.809, acc=0.964, loss=39.809, backward_time=0.338, grad_norm=84.437, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.538e-04, train_time=3.143 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:50:54,635 (trainer:732) INFO: 54epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.240, loss_att=40.617, acc=0.965, loss=40.617, backward_time=0.339, grad_norm=90.847, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.534e-04, train_time=3.142 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-25 23:57:45,528 (trainer:732) INFO: 54epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.236, loss_att=40.542, acc=0.964, loss=40.542, backward_time=0.335, grad_norm=87.229, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.530e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:04:38,316 (trainer:732) INFO: 54epoch:train:2121-2650batch: iter_time=8.196e-04, forward_time=0.237, loss_att=39.897, acc=0.963, loss=39.897, backward_time=0.336, grad_norm=87.180, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.527e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:11:33,737 (trainer:732) INFO: 54epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.238, loss_att=40.562, acc=0.964, loss=40.562, backward_time=0.339, grad_norm=85.218, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=7.523e-04, train_time=3.133 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:18:29,262 (trainer:732) INFO: 54epoch:train:3181-3710batch: iter_time=9.148e-04, forward_time=0.239, loss_att=39.771, acc=0.965, loss=39.771, backward_time=0.337, grad_norm=92.319, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=7.520e-04, train_time=3.136 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:25:21,100 (trainer:732) INFO: 54epoch:train:3711-4240batch: iter_time=9.717e-04, forward_time=0.238, loss_att=39.644, acc=0.964, loss=39.644, backward_time=0.334, grad_norm=86.716, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.516e-04, train_time=3.106 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<40539> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<35269> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<40703> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<44101> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<35561> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<39623> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<27859> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<57391> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<48978> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<57575> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<33366> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<33394> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<31410> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 145) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<38072> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<39867> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<47628> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<47630> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<25629> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<26861> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<17783> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<28699> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<28687> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 156) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<40285> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<46035> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 157) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<58540> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:32:13,774 (trainer:732) INFO: 54epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.236, loss_att=40.913, acc=0.964, loss=40.913, backward_time=0.336, grad_norm=85.549, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=7.513e-04, train_time=3.115 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<58778> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<63121> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<30021> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<51236> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<51225> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<38624> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<41634> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 150) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 150) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:39:06,796 (trainer:732) INFO: 54epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.236, loss_att=41.756, acc=0.964, loss=41.756, backward_time=0.336, grad_norm=96.061, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.509e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:45:57,103 (trainer:732) INFO: 54epoch:train:5301-5830batch: iter_time=9.876e-04, forward_time=0.236, loss_att=40.158, acc=0.964, loss=40.158, backward_time=0.335, grad_norm=93.594, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.506e-04, train_time=3.098 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:52:49,590 (trainer:732) INFO: 54epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.235, loss_att=41.131, acc=0.963, loss=41.131, backward_time=0.335, grad_norm=86.935, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=7.502e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 00:59:41,603 (trainer:732) INFO: 54epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.235, loss_att=40.356, acc=0.964, loss=40.356, backward_time=0.337, grad_norm=89.728, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.499e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:06:33,591 (trainer:732) INFO: 54epoch:train:6891-7420batch: iter_time=9.655e-04, forward_time=0.235, loss_att=39.371, acc=0.964, loss=39.371, backward_time=0.335, grad_norm=86.135, clip=100.000, loss_scale=1.000, optim_step_time=0.205, optim0_lr0=7.495e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:13:22,226 (trainer:732) INFO: 54epoch:train:7421-7950batch: iter_time=7.734e-04, forward_time=0.234, loss_att=39.974, acc=0.964, loss=39.974, backward_time=0.333, grad_norm=92.933, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.492e-04, train_time=3.085 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:20:14,109 (trainer:732) INFO: 54epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.235, loss_att=41.536, acc=0.963, loss=41.536, backward_time=0.335, grad_norm=90.559, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.488e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:27:06,590 (trainer:732) INFO: 54epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.234, loss_att=41.766, acc=0.964, loss=41.766, backward_time=0.336, grad_norm=86.034, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=7.485e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:33:58,945 (trainer:732) INFO: 54epoch:train:9011-9540batch: iter_time=9.920e-04, forward_time=0.235, loss_att=41.164, acc=0.964, loss=41.164, backward_time=0.335, grad_norm=97.920, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.481e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:40:52,174 (trainer:732) INFO: 54epoch:train:9541-10070batch: iter_time=9.878e-04, forward_time=0.236, loss_att=41.085, acc=0.964, loss=41.085, backward_time=0.336, grad_norm=89.432, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=7.478e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:47:43,676 (trainer:732) INFO: 54epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.235, loss_att=41.167, acc=0.964, loss=41.167, backward_time=0.335, grad_norm=86.065, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.475e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:57:13,957 (trainer:338) INFO: 54epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=40.528, acc=0.964, loss=40.528, backward_time=0.336, grad_norm=88.913, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.508e-04, train_time=3.194, time=2 hours, 21 minutes and 29.22 seconds, total_count=573210, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.842, acc=0.951, cer=0.062, wer=0.182, loss=57.842, time=4 minutes and 5.65 seconds, total_count=648, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 3.59 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:57:23,116 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:57:23,132 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/46epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 01:57:23,133 (trainer:272) INFO: 55/60epoch started. Estimated time to finish: 14 hours, 26 minutes and 44.02 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:08:05,608 (trainer:732) INFO: 55epoch:train:1-530batch: iter_time=0.005, forward_time=0.242, loss_att=39.172, acc=0.966, loss=39.172, backward_time=0.341, grad_norm=84.048, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.471e-04, train_time=4.855 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:15:02,069 (trainer:732) INFO: 55epoch:train:531-1060batch: iter_time=0.001, forward_time=0.241, loss_att=40.199, acc=0.965, loss=40.199, backward_time=0.337, grad_norm=85.871, clip=100.000, loss_scale=1.000, optim_step_time=0.204, optim0_lr0=7.468e-04, train_time=3.140 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:21:53,745 (trainer:732) INFO: 55epoch:train:1061-1590batch: iter_time=7.525e-04, forward_time=0.236, loss_att=39.616, acc=0.965, loss=39.616, backward_time=0.337, grad_norm=83.353, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.464e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:28:48,885 (trainer:732) INFO: 55epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.237, loss_att=40.542, acc=0.965, loss=40.542, backward_time=0.337, grad_norm=81.969, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.461e-04, train_time=3.130 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:35:40,565 (trainer:732) INFO: 55epoch:train:2121-2650batch: iter_time=9.223e-04, forward_time=0.237, loss_att=40.858, acc=0.964, loss=40.858, backward_time=0.336, grad_norm=93.574, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.457e-04, train_time=3.109 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:42:30,148 (trainer:732) INFO: 55epoch:train:2651-3180batch: iter_time=8.946e-04, forward_time=0.235, loss_att=40.400, acc=0.964, loss=40.400, backward_time=0.335, grad_norm=84.848, clip=100.000, loss_scale=1.000, optim_step_time=0.183, optim0_lr0=7.454e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:49:18,777 (trainer:732) INFO: 55epoch:train:3181-3710batch: iter_time=8.995e-04, forward_time=0.235, loss_att=39.378, acc=0.964, loss=39.378, backward_time=0.335, grad_norm=88.757, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=7.450e-04, train_time=3.085 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 02:56:06,345 (trainer:732) INFO: 55epoch:train:3711-4240batch: iter_time=8.164e-04, forward_time=0.231, loss_att=41.329, acc=0.964, loss=41.329, backward_time=0.333, grad_norm=85.464, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=7.447e-04, train_time=3.074 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:02:55,095 (trainer:732) INFO: 55epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.235, loss_att=40.592, acc=0.964, loss=40.592, backward_time=0.333, grad_norm=89.458, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.444e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:09:46,075 (trainer:732) INFO: 55epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.235, loss_att=40.993, acc=0.963, loss=40.993, backward_time=0.336, grad_norm=93.162, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.440e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:16:38,022 (trainer:732) INFO: 55epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.236, loss_att=39.612, acc=0.964, loss=39.612, backward_time=0.335, grad_norm=89.020, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.437e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:23:27,441 (trainer:732) INFO: 55epoch:train:5831-6360batch: iter_time=9.527e-04, forward_time=0.232, loss_att=40.281, acc=0.964, loss=40.281, backward_time=0.335, grad_norm=99.554, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.433e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:30:16,725 (trainer:732) INFO: 55epoch:train:6361-6890batch: iter_time=7.871e-04, forward_time=0.232, loss_att=39.684, acc=0.964, loss=39.684, backward_time=0.333, grad_norm=83.276, clip=100.000, loss_scale=1.000, optim_step_time=0.200, optim0_lr0=7.430e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:37:08,114 (trainer:732) INFO: 55epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.235, loss_att=40.145, acc=0.965, loss=40.145, backward_time=0.335, grad_norm=88.543, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.426e-04, train_time=3.101 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:44:00,581 (trainer:732) INFO: 55epoch:train:7421-7950batch: iter_time=9.078e-04, forward_time=0.235, loss_att=40.280, acc=0.964, loss=40.280, backward_time=0.336, grad_norm=88.778, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.423e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:50:50,393 (trainer:732) INFO: 55epoch:train:7951-8480batch: iter_time=9.448e-04, forward_time=0.236, loss_att=40.422, acc=0.964, loss=40.422, backward_time=0.335, grad_norm=85.460, clip=100.000, loss_scale=1.000, optim_step_time=0.182, optim0_lr0=7.420e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 03:57:40,190 (trainer:732) INFO: 55epoch:train:8481-9010batch: iter_time=9.483e-04, forward_time=0.234, loss_att=39.810, acc=0.964, loss=39.810, backward_time=0.335, grad_norm=86.895, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.416e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:04:31,203 (trainer:732) INFO: 55epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.234, loss_att=41.045, acc=0.963, loss=41.045, backward_time=0.334, grad_norm=84.457, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.413e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:11:19,235 (trainer:732) INFO: 55epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.233, loss_att=39.701, acc=0.964, loss=39.701, backward_time=0.334, grad_norm=84.857, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=7.410e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:18:06,448 (trainer:732) INFO: 55epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.232, loss_att=41.427, acc=0.963, loss=41.427, backward_time=0.332, grad_norm=87.753, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.406e-04, train_time=3.071 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:27:32,267 (trainer:338) INFO: 55epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=40.282, acc=0.964, loss=40.282, backward_time=0.335, grad_norm=87.446, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.438e-04, train_time=3.185, time=2 hours, 21 minutes and 3 seconds, total_count=583825, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.314, acc=0.951, cer=0.061, wer=0.180, loss=57.314, time=4 minutes and 0.21 seconds, total_count=660, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 5.92 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:27:42,805 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:27:42,821 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/41epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:27:42,822 (trainer:272) INFO: 56/60epoch started. Estimated time to finish: 12 hours, 2 minutes and 54.17 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:38:16,538 (trainer:732) INFO: 56epoch:train:1-530batch: iter_time=0.003, forward_time=0.238, loss_att=39.261, acc=0.965, loss=39.261, backward_time=0.339, grad_norm=86.188, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.403e-04, train_time=4.788 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:45:08,328 (trainer:732) INFO: 56epoch:train:531-1060batch: iter_time=8.520e-04, forward_time=0.237, loss_att=39.184, acc=0.965, loss=39.184, backward_time=0.338, grad_norm=86.712, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.399e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:51:59,976 (trainer:732) INFO: 56epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.236, loss_att=39.096, acc=0.965, loss=39.096, backward_time=0.335, grad_norm=86.689, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.396e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 04:58:52,911 (trainer:732) INFO: 56epoch:train:1591-2120batch: iter_time=0.001, forward_time=0.236, loss_att=39.819, acc=0.965, loss=39.819, backward_time=0.337, grad_norm=95.352, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.393e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:05:44,098 (trainer:732) INFO: 56epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.237, loss_att=39.779, acc=0.965, loss=39.779, backward_time=0.335, grad_norm=87.624, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.389e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:12:35,229 (trainer:732) INFO: 56epoch:train:2651-3180batch: iter_time=9.839e-04, forward_time=0.235, loss_att=40.182, acc=0.965, loss=40.182, backward_time=0.335, grad_norm=87.886, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.386e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:19:22,847 (trainer:732) INFO: 56epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.233, loss_att=40.160, acc=0.963, loss=40.160, backward_time=0.333, grad_norm=88.585, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.383e-04, train_time=3.077 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:26:15,601 (trainer:732) INFO: 56epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.236, loss_att=39.914, acc=0.965, loss=39.914, backward_time=0.335, grad_norm=91.564, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.379e-04, train_time=3.112 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:33:05,599 (trainer:732) INFO: 56epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.235, loss_att=39.245, acc=0.964, loss=39.245, backward_time=0.334, grad_norm=86.576, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.376e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:39:59,973 (trainer:732) INFO: 56epoch:train:4771-5300batch: iter_time=9.435e-04, forward_time=0.238, loss_att=40.404, acc=0.964, loss=40.404, backward_time=0.337, grad_norm=93.208, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.373e-04, train_time=3.124 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:46:52,393 (trainer:732) INFO: 56epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.237, loss_att=40.248, acc=0.964, loss=40.248, backward_time=0.335, grad_norm=86.737, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.369e-04, train_time=3.114 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 05:53:46,211 (trainer:732) INFO: 56epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.237, loss_att=39.633, acc=0.965, loss=39.633, backward_time=0.336, grad_norm=87.550, clip=100.000, loss_scale=1.000, optim_step_time=0.203, optim0_lr0=7.366e-04, train_time=3.120 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:00:36,576 (trainer:732) INFO: 56epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.235, loss_att=40.049, acc=0.964, loss=40.049, backward_time=0.336, grad_norm=97.930, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.363e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:07:27,501 (trainer:732) INFO: 56epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.236, loss_att=39.702, acc=0.964, loss=39.702, backward_time=0.335, grad_norm=83.794, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.359e-04, train_time=3.099 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:14:20,276 (trainer:732) INFO: 56epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.236, loss_att=39.750, acc=0.965, loss=39.750, backward_time=0.337, grad_norm=84.781, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.356e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:21:10,277 (trainer:732) INFO: 56epoch:train:7951-8480batch: iter_time=7.376e-04, forward_time=0.235, loss_att=40.177, acc=0.964, loss=40.177, backward_time=0.334, grad_norm=85.833, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.353e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:28:04,500 (trainer:732) INFO: 56epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.237, loss_att=41.069, acc=0.964, loss=41.069, backward_time=0.336, grad_norm=92.228, clip=100.000, loss_scale=1.000, optim_step_time=0.206, optim0_lr0=7.350e-04, train_time=3.127 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:34:56,251 (trainer:732) INFO: 56epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=40.860, acc=0.963, loss=40.860, backward_time=0.335, grad_norm=89.385, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=7.346e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:41:46,188 (trainer:732) INFO: 56epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.234, loss_att=40.119, acc=0.965, loss=40.119, backward_time=0.335, grad_norm=93.190, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.343e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:48:39,689 (trainer:732) INFO: 56epoch:train:10071-10600batch: iter_time=0.001, forward_time=0.236, loss_att=38.833, acc=0.966, loss=38.833, backward_time=0.336, grad_norm=86.199, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.340e-04, train_time=3.118 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:58:06,169 (trainer:338) INFO: 56epoch results: [train] iter_time=0.001, forward_time=0.236, loss_att=39.878, acc=0.965, loss=39.878, backward_time=0.336, grad_norm=88.917, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.371e-04, train_time=3.190, time=2 hours, 21 minutes and 17.24 seconds, total_count=594440, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.275, acc=0.951, cer=0.061, wer=0.180, loss=57.275, time=3 minutes and 57.72 seconds, total_count=672, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 8.39 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:58:16,975 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:58:16,999 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/47epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 06:58:16,999 (trainer:272) INFO: 57/60epoch started. Estimated time to finish: 9 hours, 38 minutes and 49.28 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:08:43,080 (trainer:732) INFO: 57epoch:train:1-530batch: iter_time=0.002, forward_time=0.238, loss_att=39.591, acc=0.965, loss=39.591, backward_time=0.339, grad_norm=83.755, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.336e-04, train_time=4.732 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:15:34,312 (trainer:732) INFO: 57epoch:train:531-1060batch: iter_time=7.870e-04, forward_time=0.238, loss_att=38.905, acc=0.965, loss=38.905, backward_time=0.335, grad_norm=87.883, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.333e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:22:26,627 (trainer:732) INFO: 57epoch:train:1061-1590batch: iter_time=9.180e-04, forward_time=0.236, loss_att=38.544, acc=0.966, loss=38.544, backward_time=0.336, grad_norm=94.739, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.330e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:29:18,370 (trainer:732) INFO: 57epoch:train:1591-2120batch: iter_time=7.031e-04, forward_time=0.235, loss_att=39.167, acc=0.965, loss=39.167, backward_time=0.337, grad_norm=85.377, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.327e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:36:10,700 (trainer:732) INFO: 57epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.236, loss_att=39.142, acc=0.965, loss=39.142, backward_time=0.338, grad_norm=94.142, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.323e-04, train_time=3.112 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:43:03,794 (trainer:732) INFO: 57epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.236, loss_att=40.210, acc=0.964, loss=40.210, backward_time=0.336, grad_norm=91.301, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.320e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:49:55,427 (trainer:732) INFO: 57epoch:train:3181-3710batch: iter_time=9.280e-04, forward_time=0.235, loss_att=39.260, acc=0.965, loss=39.260, backward_time=0.336, grad_norm=94.707, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.317e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 07:56:47,277 (trainer:732) INFO: 57epoch:train:3711-4240batch: iter_time=9.711e-04, forward_time=0.235, loss_att=39.860, acc=0.965, loss=39.860, backward_time=0.336, grad_norm=88.230, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.314e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:03:36,691 (trainer:732) INFO: 57epoch:train:4241-4770batch: iter_time=7.735e-04, forward_time=0.233, loss_att=40.689, acc=0.964, loss=40.689, backward_time=0.335, grad_norm=88.106, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.310e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:10:27,189 (trainer:732) INFO: 57epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.235, loss_att=39.247, acc=0.965, loss=39.247, backward_time=0.334, grad_norm=98.130, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.307e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:17:17,139 (trainer:732) INFO: 57epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.234, loss_att=40.469, acc=0.964, loss=40.469, backward_time=0.334, grad_norm=89.186, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.304e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:24:06,610 (trainer:732) INFO: 57epoch:train:5831-6360batch: iter_time=9.515e-04, forward_time=0.234, loss_att=39.960, acc=0.965, loss=39.960, backward_time=0.335, grad_norm=89.289, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.301e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:30:53,346 (trainer:732) INFO: 57epoch:train:6361-6890batch: iter_time=9.558e-04, forward_time=0.233, loss_att=39.218, acc=0.965, loss=39.218, backward_time=0.333, grad_norm=90.155, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=7.297e-04, train_time=3.071 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:37:42,190 (trainer:732) INFO: 57epoch:train:6891-7420batch: iter_time=9.188e-04, forward_time=0.234, loss_att=39.341, acc=0.965, loss=39.341, backward_time=0.335, grad_norm=93.013, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.294e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:44:31,085 (trainer:732) INFO: 57epoch:train:7421-7950batch: iter_time=7.747e-04, forward_time=0.233, loss_att=40.056, acc=0.964, loss=40.056, backward_time=0.334, grad_norm=88.418, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.291e-04, train_time=3.087 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:51:22,226 (trainer:732) INFO: 57epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.234, loss_att=39.642, acc=0.965, loss=39.642, backward_time=0.335, grad_norm=87.857, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.288e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 08:58:11,913 (trainer:732) INFO: 57epoch:train:8481-9010batch: iter_time=8.536e-04, forward_time=0.233, loss_att=38.863, acc=0.965, loss=38.863, backward_time=0.334, grad_norm=94.479, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.285e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:04:59,177 (trainer:732) INFO: 57epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.233, loss_att=39.700, acc=0.964, loss=39.700, backward_time=0.332, grad_norm=95.484, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.281e-04, train_time=3.071 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:11:50,756 (trainer:732) INFO: 57epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.235, loss_att=39.795, acc=0.965, loss=39.795, backward_time=0.335, grad_norm=88.438, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.278e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:18:37,391 (trainer:732) INFO: 57epoch:train:10071-10600batch: iter_time=8.101e-04, forward_time=0.232, loss_att=40.072, acc=0.964, loss=40.072, backward_time=0.332, grad_norm=92.064, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=7.275e-04, train_time=3.067 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:27:51,925 (trainer:338) INFO: 57epoch results: [train] iter_time=9.962e-04, forward_time=0.235, loss_att=39.587, acc=0.965, loss=39.587, backward_time=0.335, grad_norm=90.733, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.306e-04, train_time=3.177, time=2 hours, 20 minutes and 40.18 seconds, total_count=605055, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.056, acc=0.951, cer=0.062, wer=0.182, loss=58.056, time=4 minutes and 1.01 seconds, total_count=684, gpu_max_cached_mem_GB=30.221, [att_plot] time=4 minutes and 53.73 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:28:02,555 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:28:02,571 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/51epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:28:02,571 (trainer:272) INFO: 58/60epoch started. Estimated time to finish: 7 hours, 14 minutes and 25.53 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:38:30,239 (trainer:732) INFO: 58epoch:train:1-530batch: iter_time=0.004, forward_time=0.239, loss_att=39.415, acc=0.965, loss=39.415, backward_time=0.340, grad_norm=91.619, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.272e-04, train_time=4.743 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:45:23,270 (trainer:732) INFO: 58epoch:train:531-1060batch: iter_time=0.001, forward_time=0.238, loss_att=37.467, acc=0.966, loss=37.467, backward_time=0.338, grad_norm=80.774, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.269e-04, train_time=3.115 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:52:18,378 (trainer:732) INFO: 58epoch:train:1061-1590batch: iter_time=9.549e-04, forward_time=0.240, loss_att=38.826, acc=0.965, loss=38.826, backward_time=0.338, grad_norm=94.860, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.265e-04, train_time=3.134 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 09:59:10,860 (trainer:732) INFO: 58epoch:train:1591-2120batch: iter_time=9.773e-04, forward_time=0.236, loss_att=38.040, acc=0.967, loss=38.040, backward_time=0.338, grad_norm=85.416, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.262e-04, train_time=3.110 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:06:03,247 (trainer:732) INFO: 58epoch:train:2121-2650batch: iter_time=9.655e-04, forward_time=0.237, loss_att=40.403, acc=0.964, loss=40.403, backward_time=0.336, grad_norm=89.726, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=7.259e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:12:56,094 (trainer:732) INFO: 58epoch:train:2651-3180batch: iter_time=9.854e-04, forward_time=0.236, loss_att=39.354, acc=0.965, loss=39.354, backward_time=0.336, grad_norm=86.529, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.256e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:19:44,092 (trainer:732) INFO: 58epoch:train:3181-3710batch: iter_time=9.580e-04, forward_time=0.234, loss_att=39.981, acc=0.965, loss=39.981, backward_time=0.334, grad_norm=91.457, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.253e-04, train_time=3.080 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:26:32,580 (trainer:732) INFO: 58epoch:train:3711-4240batch: iter_time=8.367e-04, forward_time=0.232, loss_att=39.717, acc=0.965, loss=39.717, backward_time=0.334, grad_norm=92.380, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.250e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:33:22,171 (trainer:732) INFO: 58epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.234, loss_att=39.444, acc=0.965, loss=39.444, backward_time=0.335, grad_norm=85.988, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.246e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:40:14,081 (trainer:732) INFO: 58epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.235, loss_att=39.269, acc=0.965, loss=39.269, backward_time=0.336, grad_norm=85.866, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.243e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:47:04,590 (trainer:732) INFO: 58epoch:train:5301-5830batch: iter_time=9.021e-04, forward_time=0.234, loss_att=39.830, acc=0.965, loss=39.830, backward_time=0.335, grad_norm=85.785, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.240e-04, train_time=3.100 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 10:53:57,893 (trainer:732) INFO: 58epoch:train:5831-6360batch: iter_time=9.988e-04, forward_time=0.235, loss_att=39.517, acc=0.966, loss=39.517, backward_time=0.337, grad_norm=88.672, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.237e-04, train_time=3.116 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:00:45,711 (trainer:732) INFO: 58epoch:train:6361-6890batch: iter_time=0.001, forward_time=0.233, loss_att=38.939, acc=0.966, loss=38.939, backward_time=0.334, grad_norm=92.949, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.234e-04, train_time=3.078 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:07:37,223 (trainer:732) INFO: 58epoch:train:6891-7420batch: iter_time=9.685e-04, forward_time=0.235, loss_att=38.931, acc=0.965, loss=38.931, backward_time=0.336, grad_norm=90.332, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.231e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:14:25,638 (trainer:732) INFO: 58epoch:train:7421-7950batch: iter_time=8.817e-04, forward_time=0.233, loss_att=38.813, acc=0.965, loss=38.813, backward_time=0.334, grad_norm=90.766, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.228e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:21:13,555 (trainer:732) INFO: 58epoch:train:7951-8480batch: iter_time=9.819e-04, forward_time=0.233, loss_att=39.071, acc=0.964, loss=39.071, backward_time=0.333, grad_norm=83.178, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.225e-04, train_time=3.075 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:28:02,595 (trainer:732) INFO: 58epoch:train:8481-9010batch: iter_time=8.796e-04, forward_time=0.234, loss_att=39.980, acc=0.965, loss=39.980, backward_time=0.334, grad_norm=98.642, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.221e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:34:54,761 (trainer:732) INFO: 58epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=39.645, acc=0.965, loss=39.645, backward_time=0.335, grad_norm=92.840, clip=100.000, loss_scale=1.000, optim_step_time=0.201, optim0_lr0=7.218e-04, train_time=3.108 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:41:44,757 (trainer:732) INFO: 58epoch:train:9541-10070batch: iter_time=9.346e-04, forward_time=0.234, loss_att=39.443, acc=0.964, loss=39.443, backward_time=0.334, grad_norm=86.178, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.215e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:48:35,145 (trainer:732) INFO: 58epoch:train:10071-10600batch: iter_time=8.040e-04, forward_time=0.234, loss_att=39.870, acc=0.965, loss=39.870, backward_time=0.334, grad_norm=90.088, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.212e-04, train_time=3.095 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:57:37,173 (trainer:338) INFO: 58epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=39.302, acc=0.965, loss=39.302, backward_time=0.335, grad_norm=89.260, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.242e-04, train_time=3.181, time=2 hours, 20 minutes and 52.11 seconds, total_count=615670, gpu_max_cached_mem_GB=30.221, [valid] loss_att=58.026, acc=0.951, cer=0.063, wer=0.183, loss=58.026, time=3 minutes and 34.3 seconds, total_count=696, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 8.19 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:57:47,848 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:57:47,866 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/43epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 11:57:47,866 (trainer:272) INFO: 59/60epoch started. Estimated time to finish: 4 hours, 49 minutes and 48.89 seconds +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:08:17,421 (trainer:732) INFO: 59epoch:train:1-530batch: iter_time=0.004, forward_time=0.241, loss_att=37.908, acc=0.966, loss=37.908, backward_time=0.338, grad_norm=88.866, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.209e-04, train_time=4.758 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:15:08,014 (trainer:732) INFO: 59epoch:train:531-1060batch: iter_time=9.740e-04, forward_time=0.236, loss_att=38.211, acc=0.966, loss=38.211, backward_time=0.336, grad_norm=86.222, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.206e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:22:00,064 (trainer:732) INFO: 59epoch:train:1061-1590batch: iter_time=0.001, forward_time=0.236, loss_att=38.382, acc=0.966, loss=38.382, backward_time=0.336, grad_norm=91.078, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.203e-04, train_time=3.111 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:28:52,308 (trainer:732) INFO: 59epoch:train:1591-2120batch: iter_time=8.819e-04, forward_time=0.236, loss_att=38.743, acc=0.965, loss=38.743, backward_time=0.337, grad_norm=87.604, clip=100.000, loss_scale=1.000, optim_step_time=0.193, optim0_lr0=7.200e-04, train_time=3.109 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:35:40,542 (trainer:732) INFO: 59epoch:train:2121-2650batch: iter_time=9.581e-04, forward_time=0.234, loss_att=38.032, acc=0.965, loss=38.032, backward_time=0.333, grad_norm=91.379, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.196e-04, train_time=3.081 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:42:32,212 (trainer:732) INFO: 59epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.235, loss_att=39.728, acc=0.965, loss=39.728, backward_time=0.337, grad_norm=86.267, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.193e-04, train_time=3.105 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:49:23,332 (trainer:732) INFO: 59epoch:train:3181-3710batch: iter_time=0.001, forward_time=0.235, loss_att=37.897, acc=0.966, loss=37.897, backward_time=0.336, grad_norm=85.262, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.190e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 12:56:17,259 (trainer:732) INFO: 59epoch:train:3711-4240batch: iter_time=0.001, forward_time=0.236, loss_att=40.020, acc=0.965, loss=40.020, backward_time=0.337, grad_norm=84.764, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.187e-04, train_time=3.122 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:03:11,868 (trainer:732) INFO: 59epoch:train:4241-4770batch: iter_time=0.001, forward_time=0.236, loss_att=38.830, acc=0.965, loss=38.830, backward_time=0.337, grad_norm=89.594, clip=100.000, loss_scale=1.000, optim_step_time=0.205, optim0_lr0=7.184e-04, train_time=3.131 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:10:01,607 (trainer:732) INFO: 59epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.234, loss_att=38.665, acc=0.965, loss=38.665, backward_time=0.334, grad_norm=89.033, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.181e-04, train_time=3.089 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:16:53,126 (trainer:732) INFO: 59epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.235, loss_att=39.311, acc=0.965, loss=39.311, backward_time=0.334, grad_norm=93.494, clip=100.000, loss_scale=1.000, optim_step_time=0.199, optim0_lr0=7.178e-04, train_time=3.106 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:23:42,397 (trainer:732) INFO: 59epoch:train:5831-6360batch: iter_time=8.430e-04, forward_time=0.233, loss_att=38.682, acc=0.965, loss=38.682, backward_time=0.335, grad_norm=90.961, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.175e-04, train_time=3.086 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:30:32,690 (trainer:732) INFO: 59epoch:train:6361-6890batch: iter_time=7.974e-04, forward_time=0.234, loss_att=39.677, acc=0.965, loss=39.677, backward_time=0.336, grad_norm=85.184, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.172e-04, train_time=3.097 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:37:23,028 (trainer:732) INFO: 59epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.234, loss_att=39.343, acc=0.965, loss=39.343, backward_time=0.334, grad_norm=87.097, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.169e-04, train_time=3.094 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:44:15,437 (trainer:732) INFO: 59epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.234, loss_att=39.067, acc=0.966, loss=39.067, backward_time=0.336, grad_norm=89.309, clip=100.000, loss_scale=1.000, optim_step_time=0.202, optim0_lr0=7.166e-04, train_time=3.113 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:51:04,842 (trainer:732) INFO: 59epoch:train:7951-8480batch: iter_time=0.001, forward_time=0.233, loss_att=39.344, acc=0.966, loss=39.344, backward_time=0.334, grad_norm=90.598, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.163e-04, train_time=3.088 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 13:57:54,490 (trainer:732) INFO: 59epoch:train:8481-9010batch: iter_time=9.485e-04, forward_time=0.233, loss_att=39.637, acc=0.965, loss=39.637, backward_time=0.335, grad_norm=87.681, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.160e-04, train_time=3.093 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:04:44,321 (trainer:732) INFO: 59epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.235, loss_att=39.410, acc=0.965, loss=39.410, backward_time=0.334, grad_norm=85.887, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.157e-04, train_time=3.090 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992217:2992346 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 147) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 148) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992216:2992347 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 149) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:11:33,521 (trainer:732) INFO: 59epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.234, loss_att=40.230, acc=0.964, loss=40.230, backward_time=0.335, grad_norm=89.480, clip=100.000, loss_scale=1.000, optim_step_time=0.188, optim0_lr0=7.154e-04, train_time=3.090 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 158) + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992215:2992348 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 158) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:18:22,344 (trainer:732) INFO: 59epoch:train:10071-10600batch: iter_time=8.261e-04, forward_time=0.232, loss_att=39.726, acc=0.964, loss=39.726, backward_time=0.333, grad_norm=90.657, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.151e-04, train_time=3.083 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:27:39,369 (trainer:338) INFO: 59epoch results: [train] iter_time=0.001, forward_time=0.235, loss_att=39.041, acc=0.965, loss=39.041, backward_time=0.335, grad_norm=88.531, clip=100.000, loss_scale=1.000, optim_step_time=0.194, optim0_lr0=7.180e-04, train_time=3.182, time=2 hours, 20 minutes and 54.99 seconds, total_count=626285, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.414, acc=0.952, cer=0.062, wer=0.181, loss=57.414, time=3 minutes and 49.77 seconds, total_count=708, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 6.73 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:27:49,671 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:27:49,687 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/48epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:27:49,687 (trainer:272) INFO: 60/60epoch started. Estimated time to finish: 2 hours, 25 minutes and 0.47 seconds + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<45238> +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:2992218:2992349 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 146) +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:38:15,340 (trainer:732) INFO: 60epoch:train:1-530batch: iter_time=0.005, forward_time=0.238, loss_att=37.238, acc=0.967, loss=37.238, backward_time=0.337, grad_norm=80.459, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.148e-04, train_time=4.729 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:45:11,988 (trainer:732) INFO: 60epoch:train:531-1060batch: iter_time=0.001, forward_time=0.239, loss_att=37.847, acc=0.966, loss=37.847, backward_time=0.339, grad_norm=89.335, clip=100.000, loss_scale=1.000, optim_step_time=0.205, optim0_lr0=7.145e-04, train_time=3.141 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:52:04,904 (trainer:732) INFO: 60epoch:train:1061-1590batch: iter_time=9.804e-04, forward_time=0.238, loss_att=38.510, acc=0.966, loss=38.510, backward_time=0.338, grad_norm=87.686, clip=100.000, loss_scale=1.000, optim_step_time=0.189, optim0_lr0=7.142e-04, train_time=3.117 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 14:58:57,130 (trainer:732) INFO: 60epoch:train:1591-2120batch: iter_time=8.069e-04, forward_time=0.237, loss_att=38.974, acc=0.965, loss=38.974, backward_time=0.336, grad_norm=87.363, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.138e-04, train_time=3.107 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:05:48,074 (trainer:732) INFO: 60epoch:train:2121-2650batch: iter_time=0.001, forward_time=0.236, loss_att=39.528, acc=0.965, loss=39.528, backward_time=0.335, grad_norm=86.063, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.135e-04, train_time=3.102 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:12:39,492 (trainer:732) INFO: 60epoch:train:2651-3180batch: iter_time=0.001, forward_time=0.235, loss_att=38.329, acc=0.966, loss=38.329, backward_time=0.336, grad_norm=90.771, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.132e-04, train_time=3.103 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:19:27,703 (trainer:732) INFO: 60epoch:train:3181-3710batch: iter_time=9.197e-04, forward_time=0.233, loss_att=39.730, acc=0.965, loss=39.730, backward_time=0.334, grad_norm=92.029, clip=100.000, loss_scale=1.000, optim_step_time=0.187, optim0_lr0=7.129e-04, train_time=3.082 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:26:15,812 (trainer:732) INFO: 60epoch:train:3711-4240batch: iter_time=8.292e-04, forward_time=0.233, loss_att=39.368, acc=0.965, loss=39.368, backward_time=0.335, grad_norm=95.406, clip=100.000, loss_scale=1.000, optim_step_time=0.184, optim0_lr0=7.126e-04, train_time=3.077 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:33:05,268 (trainer:732) INFO: 60epoch:train:4241-4770batch: iter_time=8.211e-04, forward_time=0.233, loss_att=38.944, acc=0.966, loss=38.944, backward_time=0.335, grad_norm=90.344, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.123e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:39:55,373 (trainer:732) INFO: 60epoch:train:4771-5300batch: iter_time=0.001, forward_time=0.233, loss_att=37.171, acc=0.966, loss=37.171, backward_time=0.334, grad_norm=91.192, clip=100.000, loss_scale=1.000, optim_step_time=0.198, optim0_lr0=7.120e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:46:42,657 (trainer:732) INFO: 60epoch:train:5301-5830batch: iter_time=0.001, forward_time=0.232, loss_att=39.015, acc=0.965, loss=39.015, backward_time=0.334, grad_norm=86.277, clip=100.000, loss_scale=1.000, optim_step_time=0.185, optim0_lr0=7.117e-04, train_time=3.075 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 15:53:33,277 (trainer:732) INFO: 60epoch:train:5831-6360batch: iter_time=0.001, forward_time=0.233, loss_att=39.339, acc=0.965, loss=39.339, backward_time=0.335, grad_norm=86.100, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.115e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:00:21,848 (trainer:732) INFO: 60epoch:train:6361-6890batch: iter_time=7.851e-04, forward_time=0.232, loss_att=39.690, acc=0.965, loss=39.690, backward_time=0.333, grad_norm=89.067, clip=100.000, loss_scale=1.000, optim_step_time=0.196, optim0_lr0=7.112e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:07:11,612 (trainer:732) INFO: 60epoch:train:6891-7420batch: iter_time=0.001, forward_time=0.234, loss_att=38.748, acc=0.965, loss=38.748, backward_time=0.335, grad_norm=90.303, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.109e-04, train_time=3.090 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:14:01,147 (trainer:732) INFO: 60epoch:train:7421-7950batch: iter_time=0.001, forward_time=0.233, loss_att=39.206, acc=0.966, loss=39.206, backward_time=0.336, grad_norm=80.733, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.106e-04, train_time=3.092 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:20:49,412 (trainer:732) INFO: 60epoch:train:7951-8480batch: iter_time=8.037e-04, forward_time=0.231, loss_att=38.531, acc=0.966, loss=38.531, backward_time=0.334, grad_norm=88.147, clip=100.000, loss_scale=1.000, optim_step_time=0.186, optim0_lr0=7.103e-04, train_time=3.078 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:27:38,826 (trainer:732) INFO: 60epoch:train:8481-9010batch: iter_time=0.001, forward_time=0.233, loss_att=38.685, acc=0.966, loss=38.685, backward_time=0.336, grad_norm=90.501, clip=100.000, loss_scale=1.000, optim_step_time=0.195, optim0_lr0=7.100e-04, train_time=3.091 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:34:29,264 (trainer:732) INFO: 60epoch:train:9011-9540batch: iter_time=0.001, forward_time=0.232, loss_att=39.140, acc=0.965, loss=39.140, backward_time=0.336, grad_norm=85.469, clip=100.000, loss_scale=1.000, optim_step_time=0.197, optim0_lr0=7.097e-04, train_time=3.096 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:41:17,840 (trainer:732) INFO: 60epoch:train:9541-10070batch: iter_time=0.001, forward_time=0.233, loss_att=38.186, acc=0.966, loss=38.186, backward_time=0.333, grad_norm=84.283, clip=100.000, loss_scale=1.000, optim_step_time=0.191, optim0_lr0=7.094e-04, train_time=3.084 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:48:05,375 (trainer:732) INFO: 60epoch:train:10071-10600batch: iter_time=8.776e-04, forward_time=0.231, loss_att=39.163, acc=0.965, loss=39.163, backward_time=0.333, grad_norm=91.709, clip=100.000, loss_scale=1.000, optim_step_time=0.190, optim0_lr0=7.091e-04, train_time=3.073 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:22,641 (trainer:338) INFO: 60epoch results: [train] iter_time=0.001, forward_time=0.234, loss_att=38.760, acc=0.965, loss=38.760, backward_time=0.335, grad_norm=88.137, clip=100.000, loss_scale=1.000, optim_step_time=0.192, optim0_lr0=7.119e-04, train_time=3.175, time=2 hours, 20 minutes and 35.75 seconds, total_count=636900, gpu_max_cached_mem_GB=30.221, [valid] loss_att=57.717, acc=0.951, cer=0.062, wer=0.181, loss=57.717, time=3 minutes and 54.06 seconds, total_count=720, gpu_max_cached_mem_GB=30.221, [att_plot] time=5 minutes and 3.13 seconds, total_count=0, gpu_max_cached_mem_GB=30.221 +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:28,835 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:28,850 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/52epoch.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:28,850 (trainer:458) INFO: The training was finished at 60 epochs +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:28,945 (average_nbest_models:69) INFO: Averaging 10best models: criterion="valid.acc": exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,275 (average_nbest_models:96) INFO: Accumulating encoder.encoders.0.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,278 (average_nbest_models:96) INFO: Accumulating encoder.encoders.1.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,290 (average_nbest_models:96) INFO: Accumulating encoder.encoders.2.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,315 (average_nbest_models:96) INFO: Accumulating encoder.encoders.3.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,328 (average_nbest_models:96) INFO: Accumulating encoder.encoders.4.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,335 (average_nbest_models:96) INFO: Accumulating encoder.encoders.5.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,348 (average_nbest_models:96) INFO: Accumulating encoder.encoders.6.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,360 (average_nbest_models:96) INFO: Accumulating encoder.encoders.7.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,369 (average_nbest_models:96) INFO: Accumulating encoder.encoders.8.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,382 (average_nbest_models:96) INFO: Accumulating encoder.encoders.9.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,399 (average_nbest_models:96) INFO: Accumulating encoder.encoders.10.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-9-0208143539-7dbf569d4f-r7nrb:0/4] 2023-11-26 16:57:42,402 (average_nbest_models:96) INFO: Accumulating encoder.encoders.11.conv_module.norm.num_batches_tracked instead of averaging +# Accounting: time=452772 threads=1 +# Ended (code 0) at Sun Nov 26 16:57:47 CST 2023, elapsed time 452772 seconds diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth new file mode 100644 index 0000000000000000000000000000000000000000..60bb58e03a2fad6df20efe000afe5e16a853edab --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6942cd8e2a7d9015a74726b4164bd69324059cb230e97ebccf434f6b1afdee4c +size 172358249 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth new file mode 100644 index 0000000000000000000000000000000000000000..60bb58e03a2fad6df20efe000afe5e16a853edab --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6942cd8e2a7d9015a74726b4164bd69324059cb230e97ebccf434f6b1afdee4c +size 172358249 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth new file mode 100644 index 0000000000000000000000000000000000000000..147771fda88d4e5473bfe6ea2ada0b5771f2d8a1 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63fced0fdc8817cd4f376fe8e33ffc3b07a8615ce2151f6b5bd6be0990bb75ff +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc52644735a259dac2ce20c87bc1dc25ae5ba042 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a83dbcc31cbe211bf57d554185f00fdefa382c828fadc33a1202749a7cafbc +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..bc623114878ec57537a5255609f646968766a1c3 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8359149a4f68dfaf1fb54be64f57b199f0c2a2ce86c123ca9479a8f29854ab59 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/53epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/53epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..719aabbea9b244724eac988a48a08069e4620a56 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/53epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7928e027e6aaca6f547f40d08d36ed21ec877c81f13b6e8432ffd119a4f635 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..19c4ab3e0efef6e15ea1d89b0c6f2692f2862ce9 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2917e31143d7136e849de1dee47a99942384a481fe11ed989b17d7faf676170c +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/55epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/55epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3c7fbba3be062b8e4cf5b15d9b1ad8ba0be3afb --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/55epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035c566070194c490d66447e070ae13154064c7d387b0e57a13693deabadd34d +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..206175b3485835b48457449908401893c4039059 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ca103e0d53a77d3de58e2824377a80b0eaf05e94168db12568bb9d244d482b3 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..4581ca64ff25ba58415b834725f6beaaaefd5f46 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439bb2cb8c0a92c4fee9a6bf3dddaa44208168dd993affb7fb220aa59004cafd +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/58epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/58epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..29dc35f256645ab716441c106f023ceda3550054 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/58epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd7066a0fa0238c0e77d55015b918a3a10845bfb5903b302e6b7bdb0373439c +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/59epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/59epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..45bb5cb25c2775465b0de63059f6d247a64d4291 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/59epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500e10b3d0c5e3d7d0ef3215980233b0e916b2df96cd99cf0633bd6e39dc14a2 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b57f292eef5cc25c79d59e927cc0d2c9a1234fe --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff948ccc68c37ec9d32b89b18ead7d3fd3a8148f35e984996280b1556dd5ea6 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..0de150658cf78314fdb9d4273e7b1917a07104bf --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md @@ -0,0 +1,51 @@ + +# RESULTS +## Environments +- date: `Tue Mar 5 08:11:17 CST 2024` +- python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]` +- espnet version: `espnet 202308` +- pytorch version: `pytorch 1.12.1+cu116` +- Git hash: `884659f9ee95374811015381c976fa3b4f6e01db` + - Commit date: `Thu Nov 23 00:23:29 2023 +0800` + +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|226216|76.4|10.6|13.0|9.2|32.8|99.4| +|decode_sot_asr_model_valid.acc.best/dev_2spk_kaldi_fmt|1606|135101|79.0|9.3|11.7|7.4|28.4|98.9| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|209679|63.4|20.0|16.6|9.8|46.4|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|200029|51.8|26.8|21.4|8.4|56.5|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|301042|76.8|10.5|12.7|8.3|31.5|99.4| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|212871|64.2|18.7|17.1|10.2|46.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|185394|53.2|25.6|21.2|8.5|55.3|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|336490|75.4|12.5|12.0|8.7|33.3|99.8| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|266074|60.6|23.2|16.1|10.8|50.2|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|259138|48.8|30.3|21.0|8.8|60.1|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|178761|80.3|8.9|10.8|6.1|25.8|98.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|205496|78.5|11.6|10.0|6.4|28.0|99.7| +|decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|3000|0.0|100.0|0.0|3110.6|3210.6|100.0| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|1230801|82.6|6.2|11.2|7.4|24.7|99.4| +|decode_sot_asr_model_valid.acc.best/dev_2spk_kaldi_fmt|1606|735694|84.5|5.1|10.5|5.9|21.4|98.9| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|1140428|73.8|10.2|15.9|8.5|34.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|1087409|65.1|12.8|22.1|7.8|42.7|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|1550429|83.0|6.2|10.8|6.6|23.6|99.4| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|1084475|74.0|10.3|15.7|8.6|34.6|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|938467|65.9|12.8|21.3|8.1|42.2|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|1742136|82.5|7.0|10.5|7.0|24.5|99.8| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|1381987|72.5|11.8|15.7|9.4|36.8|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|1346646|63.5|14.1|22.3|8.8|45.2|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|921344|85.6|4.6|9.8|5.0|19.4|98.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|1064868|85.0|5.7|9.3|5.4|20.3|99.7| +|decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|143026|16.3|83.6|0.2|299.2|382.9|100.0| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ff01f279d9890e07f770413abe3307fb7b915e1 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b1894d6081a244c82bf309483fca4c695d1fa86de21030b1c61f1f18460be3 +size 516972574 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..614bb1d5b80a69faa97d436671a90f16c28aaa7c --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml @@ -0,0 +1,227 @@ +config: conf/tuning/train_sot_asr_conformer_medium.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr +ngpu: 1 +seed: 0 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: 4 +dist_rank: 0 +local_rank: 0 +dist_master_addr: localhost +dist_master_port: 49853 +dist_launcher: null +multiprocessing_distributed: true +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: false +write_collected_feats: false +max_epoch: 60 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: +- /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 16000000 +valid_batch_bins: null +train_shape_file: +- exp/asr_stats_raw_en_char/train/speech_shape +- exp/asr_stats_raw_en_char/train/text_shape.char +valid_shape_file: +- exp/asr_stats_raw_en_char/valid/speech_shape +- exp/asr_stats_raw_en_char/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/medium_w_whamr_sp/wav.scp + - speech + - kaldi_ark +- - dump/raw/medium_w_whamr_sp/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/raw/cv_mix_clean_reverb_max_16k/wav.scp + - speech + - kaldi_ark +- - dump/raw/cv_mix_clean_reverb_max_16k/text + - text + - text +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 20000 +token_list: +- +- +- +- +- E +- T +- A +- O +- N +- I +- H +- S +- R +- D +- L +- U +- M +- C +- W +- F +- G +- Y +- P +- B +- V +- K +- '''' +- X +- J +- Q +- Z +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true +joint_net_conf: null +use_preprocessor: true +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + fs: 16k +specaug: null +specaug_conf: {} +normalize: global_mvn +normalize_conf: + stats_file: exp/asr_stats_raw_en_char/train/feats_stats.npz +model: espnet +model_conf: + ctc_weight: 0.0 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: conformer +encoder_conf: + output_size: 256 + attention_heads: 4 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 +preprocessor: multi +preprocessor_conf: + speaker_change_symbol: + - +required: +- output_dir +- token_list +version: '202308' +distributed: true diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..1679e9e28dad879e6b057c786e6fc26203f9492d Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..9fa69deabfa2a5e3bdd1f7ba1dccf246a9e639f2 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..ad1beb7a02c6457dab4b3bd5d8792afb2e7916d8 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..8dcd44eeb8e61cefd50db5514c608663bad42973 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..8fa1c21ff17cdc71d112b70c360fb8c4753fb116 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..03d29aeea8e9d54076f517eba9e4e3923268998f Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..ef62b74d96a50b0a9b741fbde299de2de57dab85 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..d1cc537e17e7fd75369554e611c39f636f4e0770 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..65aa56985e4f091c712e0342b8688a3b651e0b02 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..6f9e11af68ae3b5f349bede52a7c4dbfbaf22d65 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..460bfdb13d9e33389b68b962751b15221d3e531f Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..58f7bf6a835d94774bdcdf4d4208f8d53bac2669 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..735075dd13883d90e8546dd668df0dceb03b8c49 Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..87cdf3861f2c735ef9e33bafbd59baa9c4e4035a Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..0194265613c2f88008c1cd074a2100130e5f0b8a Binary files /dev/null and b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png differ diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b57f292eef5cc25c79d59e927cc0d2c9a1234fe --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff948ccc68c37ec9d32b89b18ead7d3fd3a8148f35e984996280b1556dd5ea6 +size 172367337 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..3317cc65ba66330f944d748e7e0fa7ea12bdea70 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh @@ -0,0 +1 @@ +./asr.sh --lang en --audio_format flac.ark --stage 6 --stop_stage 15 --speed_perturb_factors '0.9 1.0 1.1' --feats_type raw --token_type char --sot_asr true --max_wav_duration 50 --speed_perturb_factors '' --feats_normalize global_mvn --use_lm false --pretrained_model /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --asr_config conf/tuning/train_sot_asr_conformer_medium.yaml --lm_config conf/tuning/train_lm_transformer.yaml --inference_config conf/tuning/decode_sot.yaml --train_set medium_w_whamr --valid_set cv_mix_clean_reverb_max_16k --test_sets tt_mix_clean_reverb_max_16k --ngpu 4 --asr_tag train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --lm_train_text data/local/other_text/text --bpe_train_text data/medium_w_whamr/text --stage 11 "$@"; exit $? diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.0 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.0 new file mode 100644 index 0000000000000000000000000000000000000000..f14359329ec3038679e3592bb5a8d434e316b92b --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de086698717551f631cf50d5e307fdffbb643173035c4843b0eac082772e9cd +size 867832140 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.1 b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.1 new file mode 100644 index 0000000000000000000000000000000000000000..b7bbadb602e90d15eecb8d54d8d1b75280476029 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708505037.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1849195.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73d7013f718df743dd3aad5dab804f8b4c82536e49347955d463850336d4d504 +size 17002 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log new file mode 100644 index 0000000000000000000000000000000000000000..109189899f67dcf9347e774050714fee6a651bb4 --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log @@ -0,0 +1,2499 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/medium_w_whamr_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/medium_w_whamr_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Wed Feb 21 16:42:04 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_medium.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/medium_w_whamr_sp/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/medium_w_whamr_sp/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:49853 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:49853 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:49853 (errno: 99 - Cannot assign requested address). +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:22,583 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:22,583 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:22,632 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,378 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,398 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,399 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,399 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,403 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:30,436 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:35,557 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:49,867 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/medium_w_whamr_sp/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/medium_w_whamr_sp/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:49,868 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=11272, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:49,872 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=11272, mean=55.6, min=8, max=218 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,075 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,151 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,151 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=56, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,152 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=56, mean=89.3, min=4, max=153 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,172 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,218 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,218 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:50,219 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849196 [1] NCCL INFO Using network Socket + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849197 [2] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849198 [3] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Setting affinity for GPU 1 to 0fffe0,0001fffe +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Setting affinity for GPU 0 to 0fffe0,0001fffe +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Channel 00 : 3[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Channel 00 : 1[3e000] -> 2[b4000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Channel 01 : 3[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Channel 01 : 1[3e000] -> 2[b4000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Channel 00 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Channel 01 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Channel 00 : 2[b4000] -> 1[3e000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Channel 01 : 2[b4000] -> 1[3e000] via direct shared memory +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Channel 00 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Channel 01 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Channel 00 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Channel 01 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Channel 00 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Channel 01 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849249 [1] NCCL INFO comm 0x7efeb8002f70 rank 1 nranks 4 cudaDev 1 busId 3e000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849251 [3] NCCL INFO comm 0x7f14cc002f70 rank 3 nranks 4 cudaDev 3 busId b5000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849248 [0] NCCL INFO comm 0x7fcc70002f70 rank 0 nranks 4 cudaDev 0 busId 3d000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849250 [2] NCCL INFO comm 0x7f34dc002f70 rank 2 nranks 4 cudaDev 2 busId b4000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849195 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:43:57,412 (trainer:284) INFO: 1/60epoch started +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:50:28,760 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 16:57:16,488 (trainer:732) INFO: 1epoch:train:1-563batch: iter_time=0.003, forward_time=0.238, loss_att=791.592, acc=0.441, loss=791.592, backward_time=0.309, grad_norm=549.979, clip=100.000, loss_scale=1.000, optim_step_time=0.103, optim0_lr0=7.150e-06, train_time=5.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:04:10,994 (trainer:732) INFO: 1epoch:train:564-1126batch: iter_time=6.938e-04, forward_time=0.228, loss_att=540.286, acc=0.540, loss=540.286, backward_time=0.309, grad_norm=160.811, clip=100.000, loss_scale=1.000, optim_step_time=0.112, optim0_lr0=2.120e-05, train_time=2.944 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:11:03,055 (trainer:732) INFO: 1epoch:train:1127-1689batch: iter_time=5.959e-04, forward_time=0.224, loss_att=479.138, acc=0.581, loss=479.138, backward_time=0.307, grad_norm=106.660, clip=100.000, loss_scale=1.000, optim_step_time=0.106, optim0_lr0=3.530e-05, train_time=2.928 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:17:41,133 (trainer:732) INFO: 1epoch:train:1690-2252batch: iter_time=4.702e-04, forward_time=0.215, loss_att=456.176, acc=0.600, loss=456.176, backward_time=0.306, grad_norm=90.214, clip=100.000, loss_scale=1.000, optim_step_time=0.093, optim0_lr0=4.940e-05, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:24:31,730 (trainer:732) INFO: 1epoch:train:2253-2815batch: iter_time=6.117e-04, forward_time=0.224, loss_att=436.773, acc=0.616, loss=436.773, backward_time=0.307, grad_norm=77.182, clip=100.000, loss_scale=1.000, optim_step_time=0.109, optim0_lr0=6.345e-05, train_time=2.919 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:31:18,655 (trainer:732) INFO: 1epoch:train:2816-3378batch: iter_time=6.263e-04, forward_time=0.220, loss_att=416.694, acc=0.628, loss=416.694, backward_time=0.304, grad_norm=70.787, clip=100.000, loss_scale=1.000, optim_step_time=0.107, optim0_lr0=7.750e-05, train_time=2.890 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:37:51,091 (trainer:732) INFO: 1epoch:train:3379-3941batch: iter_time=4.126e-04, forward_time=0.210, loss_att=401.442, acc=0.637, loss=401.442, backward_time=0.301, grad_norm=68.036, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=9.160e-05, train_time=2.787 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:44:07,147 (trainer:732) INFO: 1epoch:train:3942-4504batch: iter_time=2.893e-04, forward_time=0.202, loss_att=401.264, acc=0.648, loss=401.264, backward_time=0.297, grad_norm=70.497, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.057e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:50:27,356 (trainer:732) INFO: 1epoch:train:4505-5067batch: iter_time=2.870e-04, forward_time=0.206, loss_att=396.545, acc=0.657, loss=396.545, backward_time=0.301, grad_norm=77.286, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=1.198e-04, train_time=2.703 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 17:56:46,544 (trainer:732) INFO: 1epoch:train:5068-5630batch: iter_time=2.865e-04, forward_time=0.204, loss_att=377.828, acc=0.662, loss=377.828, backward_time=0.301, grad_norm=76.534, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.338e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:03:02,862 (trainer:732) INFO: 1epoch:train:5631-6193batch: iter_time=2.768e-04, forward_time=0.203, loss_att=370.888, acc=0.666, loss=370.888, backward_time=0.299, grad_norm=75.287, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.479e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:09:21,334 (trainer:732) INFO: 1epoch:train:6194-6756batch: iter_time=2.874e-04, forward_time=0.204, loss_att=365.634, acc=0.676, loss=365.634, backward_time=0.300, grad_norm=81.802, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.620e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:15:41,618 (trainer:732) INFO: 1epoch:train:6757-7319batch: iter_time=2.802e-04, forward_time=0.203, loss_att=362.217, acc=0.680, loss=362.217, backward_time=0.300, grad_norm=79.395, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.760e-04, train_time=2.703 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:21:58,417 (trainer:732) INFO: 1epoch:train:7320-7882batch: iter_time=2.816e-04, forward_time=0.202, loss_att=343.613, acc=0.687, loss=343.613, backward_time=0.298, grad_norm=80.793, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.901e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:28:17,301 (trainer:732) INFO: 1epoch:train:7883-8445batch: iter_time=2.819e-04, forward_time=0.203, loss_att=344.291, acc=0.692, loss=344.291, backward_time=0.300, grad_norm=83.562, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.042e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:34:32,131 (trainer:732) INFO: 1epoch:train:8446-9008batch: iter_time=2.754e-04, forward_time=0.202, loss_att=331.425, acc=0.696, loss=331.425, backward_time=0.297, grad_norm=75.797, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=2.183e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:40:51,304 (trainer:732) INFO: 1epoch:train:9009-9571batch: iter_time=2.805e-04, forward_time=0.204, loss_att=340.394, acc=0.702, loss=340.394, backward_time=0.301, grad_norm=89.742, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.323e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:47:07,975 (trainer:732) INFO: 1epoch:train:9572-10134batch: iter_time=2.839e-04, forward_time=0.203, loss_att=324.090, acc=0.705, loss=324.090, backward_time=0.299, grad_norm=84.256, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.464e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:53:23,986 (trainer:732) INFO: 1epoch:train:10135-10697batch: iter_time=2.910e-04, forward_time=0.202, loss_att=313.781, acc=0.710, loss=313.781, backward_time=0.298, grad_norm=86.784, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.605e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 18:59:41,783 (trainer:732) INFO: 1epoch:train:10698-11260batch: iter_time=2.833e-04, forward_time=0.203, loss_att=315.009, acc=0.715, loss=315.009, backward_time=0.300, grad_norm=85.707, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.746e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:08:29,851 (trainer:338) INFO: 1epoch results: [train] iter_time=5.071e-04, forward_time=0.210, loss_att=404.586, acc=0.647, loss=404.586, backward_time=0.302, grad_norm=108.414, clip=100.000, loss_scale=1.000, optim_step_time=0.077, optim0_lr0=1.410e-04, train_time=2.893, time=2 hours, 16 minutes and 6.67 seconds, total_count=11272, gpu_max_cached_mem_GB=30.053, [valid] loss_att=142.173, acc=0.723, cer=0.328, wer=0.683, loss=142.173, time=4 minutes and 52.37 seconds, total_count=56, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 33.32 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:08:33,637 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:08:33,638 (trainer:272) INFO: 2/60epoch started. Estimated time to finish: 5 days, 22 hours and 11 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:17:49,048 (trainer:732) INFO: 2epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=315.672, acc=0.722, loss=315.672, backward_time=0.299, grad_norm=94.241, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=2.889e-04, train_time=3.954 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:24:06,198 (trainer:732) INFO: 2epoch:train:564-1126batch: iter_time=2.919e-04, forward_time=0.203, loss_att=299.139, acc=0.722, loss=299.139, backward_time=0.298, grad_norm=88.689, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=3.030e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:30:24,220 (trainer:732) INFO: 2epoch:train:1127-1689batch: iter_time=2.891e-04, forward_time=0.204, loss_att=305.766, acc=0.726, loss=305.766, backward_time=0.299, grad_norm=90.492, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=3.171e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:36:43,024 (trainer:732) INFO: 2epoch:train:1690-2252batch: iter_time=2.856e-04, forward_time=0.204, loss_att=296.610, acc=0.733, loss=296.610, backward_time=0.299, grad_norm=89.158, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=3.312e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:43:00,866 (trainer:732) INFO: 2epoch:train:2253-2815batch: iter_time=2.897e-04, forward_time=0.204, loss_att=293.828, acc=0.733, loss=293.828, backward_time=0.299, grad_norm=88.453, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=3.452e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:49:19,332 (trainer:732) INFO: 2epoch:train:2816-3378batch: iter_time=2.810e-04, forward_time=0.203, loss_att=296.474, acc=0.737, loss=296.474, backward_time=0.299, grad_norm=99.350, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=3.593e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 19:55:40,893 (trainer:732) INFO: 2epoch:train:3379-3941batch: iter_time=3.017e-04, forward_time=0.206, loss_att=288.886, acc=0.741, loss=288.886, backward_time=0.301, grad_norm=94.228, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=3.734e-04, train_time=2.710 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:02:04,132 (trainer:732) INFO: 2epoch:train:3942-4504batch: iter_time=3.381e-04, forward_time=0.206, loss_att=282.128, acc=0.743, loss=282.128, backward_time=0.302, grad_norm=91.484, clip=100.000, loss_scale=1.000, optim_step_time=0.071, optim0_lr0=3.875e-04, train_time=2.721 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:08:24,850 (trainer:732) INFO: 2epoch:train:4505-5067batch: iter_time=3.166e-04, forward_time=0.204, loss_att=279.417, acc=0.746, loss=279.417, backward_time=0.298, grad_norm=93.604, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.015e-04, train_time=2.706 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:14:43,380 (trainer:732) INFO: 2epoch:train:5068-5630batch: iter_time=3.227e-04, forward_time=0.205, loss_att=277.510, acc=0.751, loss=277.510, backward_time=0.299, grad_norm=87.575, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.156e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:21:03,659 (trainer:732) INFO: 2epoch:train:5631-6193batch: iter_time=3.183e-04, forward_time=0.206, loss_att=276.290, acc=0.754, loss=276.290, backward_time=0.300, grad_norm=94.623, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.297e-04, train_time=2.701 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:27:21,061 (trainer:732) INFO: 2epoch:train:6194-6756batch: iter_time=3.231e-04, forward_time=0.204, loss_att=264.309, acc=0.755, loss=264.309, backward_time=0.298, grad_norm=95.379, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.438e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:33:39,525 (trainer:732) INFO: 2epoch:train:6757-7319batch: iter_time=3.214e-04, forward_time=0.205, loss_att=270.695, acc=0.757, loss=270.695, backward_time=0.298, grad_norm=92.749, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.578e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:39:56,996 (trainer:732) INFO: 2epoch:train:7320-7882batch: iter_time=3.183e-04, forward_time=0.204, loss_att=259.690, acc=0.760, loss=259.690, backward_time=0.298, grad_norm=90.407, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.719e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:46:16,505 (trainer:732) INFO: 2epoch:train:7883-8445batch: iter_time=3.154e-04, forward_time=0.205, loss_att=263.049, acc=0.764, loss=263.049, backward_time=0.300, grad_norm=93.826, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=4.860e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:52:33,231 (trainer:732) INFO: 2epoch:train:8446-9008batch: iter_time=3.097e-04, forward_time=0.203, loss_att=258.240, acc=0.766, loss=258.240, backward_time=0.297, grad_norm=92.329, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=5.001e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 20:58:49,796 (trainer:732) INFO: 2epoch:train:9009-9571batch: iter_time=3.287e-04, forward_time=0.203, loss_att=255.980, acc=0.766, loss=255.980, backward_time=0.297, grad_norm=95.725, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.141e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:05:08,272 (trainer:732) INFO: 2epoch:train:9572-10134batch: iter_time=3.072e-04, forward_time=0.205, loss_att=259.029, acc=0.770, loss=259.029, backward_time=0.299, grad_norm=94.531, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.282e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:11:25,923 (trainer:732) INFO: 2epoch:train:10135-10697batch: iter_time=3.169e-04, forward_time=0.204, loss_att=255.413, acc=0.771, loss=255.413, backward_time=0.298, grad_norm=95.514, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.423e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:17:44,391 (trainer:732) INFO: 2epoch:train:10698-11260batch: iter_time=3.086e-04, forward_time=0.205, loss_att=250.945, acc=0.775, loss=250.945, backward_time=0.299, grad_norm=97.152, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.564e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:26:30,080 (trainer:338) INFO: 2epoch results: [train] iter_time=4.141e-04, forward_time=0.204, loss_att=277.388, acc=0.750, loss=277.388, backward_time=0.299, grad_norm=92.991, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.229e-04, train_time=2.753, time=2 hours, 9 minutes and 32.39 seconds, total_count=22544, gpu_max_cached_mem_GB=30.271, [valid] loss_att=103.925, acc=0.795, cer=0.241, wer=0.561, loss=103.925, time=4 minutes and 54.36 seconds, total_count=112, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 29.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:26:33,929 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:26:33,931 (trainer:272) INFO: 3/60epoch started. Estimated time to finish: 5 days, 16 hours and 35 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:35:51,326 (trainer:732) INFO: 3epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=244.467, acc=0.777, loss=244.467, backward_time=0.298, grad_norm=96.046, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.708e-04, train_time=3.967 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:42:08,746 (trainer:732) INFO: 3epoch:train:564-1126batch: iter_time=2.912e-04, forward_time=0.204, loss_att=239.813, acc=0.779, loss=239.813, backward_time=0.298, grad_norm=89.544, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.848e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:48:29,425 (trainer:732) INFO: 3epoch:train:1127-1689batch: iter_time=2.845e-04, forward_time=0.206, loss_att=249.686, acc=0.780, loss=249.686, backward_time=0.301, grad_norm=98.987, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.989e-04, train_time=2.702 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 21:54:51,626 (trainer:732) INFO: 3epoch:train:1690-2252batch: iter_time=3.224e-04, forward_time=0.206, loss_att=240.102, acc=0.782, loss=240.102, backward_time=0.298, grad_norm=92.304, clip=100.000, loss_scale=1.000, optim_step_time=0.075, optim0_lr0=6.130e-04, train_time=2.715 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:01:08,831 (trainer:732) INFO: 3epoch:train:2253-2815batch: iter_time=3.046e-04, forward_time=0.204, loss_att=232.117, acc=0.785, loss=232.117, backward_time=0.297, grad_norm=90.864, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.270e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:07:26,915 (trainer:732) INFO: 3epoch:train:2816-3378batch: iter_time=2.928e-04, forward_time=0.204, loss_att=234.244, acc=0.785, loss=234.244, backward_time=0.298, grad_norm=93.109, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.411e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:13:48,023 (trainer:732) INFO: 3epoch:train:3379-3941batch: iter_time=3.024e-04, forward_time=0.206, loss_att=243.396, acc=0.787, loss=243.396, backward_time=0.300, grad_norm=98.806, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.552e-04, train_time=2.708 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:20:08,205 (trainer:732) INFO: 3epoch:train:3942-4504batch: iter_time=2.936e-04, forward_time=0.205, loss_att=237.898, acc=0.792, loss=237.898, backward_time=0.299, grad_norm=93.191, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.693e-04, train_time=2.700 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:26:25,527 (trainer:732) INFO: 3epoch:train:4505-5067batch: iter_time=2.798e-04, forward_time=0.204, loss_att=226.417, acc=0.791, loss=226.417, backward_time=0.298, grad_norm=92.519, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.833e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:32:44,510 (trainer:732) INFO: 3epoch:train:5068-5630batch: iter_time=2.824e-04, forward_time=0.204, loss_att=229.034, acc=0.794, loss=229.034, backward_time=0.299, grad_norm=96.577, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.974e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:39:01,062 (trainer:732) INFO: 3epoch:train:5631-6193batch: iter_time=2.768e-04, forward_time=0.203, loss_att=220.016, acc=0.798, loss=220.016, backward_time=0.297, grad_norm=91.416, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.115e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:45:19,228 (trainer:732) INFO: 3epoch:train:6194-6756batch: iter_time=2.911e-04, forward_time=0.204, loss_att=223.024, acc=0.801, loss=223.024, backward_time=0.299, grad_norm=95.484, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.256e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:51:37,162 (trainer:732) INFO: 3epoch:train:6757-7319batch: iter_time=2.891e-04, forward_time=0.204, loss_att=217.163, acc=0.803, loss=217.163, backward_time=0.299, grad_norm=95.446, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.396e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 22:57:57,731 (trainer:732) INFO: 3epoch:train:7320-7882batch: iter_time=2.782e-04, forward_time=0.205, loss_att=217.493, acc=0.809, loss=217.493, backward_time=0.300, grad_norm=100.936, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.537e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:04:15,634 (trainer:732) INFO: 3epoch:train:7883-8445batch: iter_time=2.833e-04, forward_time=0.204, loss_att=210.054, acc=0.811, loss=210.054, backward_time=0.298, grad_norm=97.447, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.678e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:10:33,969 (trainer:732) INFO: 3epoch:train:8446-9008batch: iter_time=2.824e-04, forward_time=0.204, loss_att=205.122, acc=0.815, loss=205.122, backward_time=0.298, grad_norm=100.251, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.819e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:16:53,699 (trainer:732) INFO: 3epoch:train:9009-9571batch: iter_time=2.855e-04, forward_time=0.205, loss_att=199.640, acc=0.819, loss=199.640, backward_time=0.300, grad_norm=102.703, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.959e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:23:12,615 (trainer:732) INFO: 3epoch:train:9572-10134batch: iter_time=2.838e-04, forward_time=0.205, loss_att=202.907, acc=0.821, loss=202.907, backward_time=0.300, grad_norm=96.260, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.100e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:29:28,094 (trainer:732) INFO: 3epoch:train:10135-10697batch: iter_time=2.841e-04, forward_time=0.202, loss_att=191.972, acc=0.823, loss=191.972, backward_time=0.296, grad_norm=96.053, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.241e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:35:45,849 (trainer:732) INFO: 3epoch:train:10698-11260batch: iter_time=2.789e-04, forward_time=0.204, loss_att=192.313, acc=0.828, loss=192.313, backward_time=0.298, grad_norm=98.493, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.382e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:44:45,467 (trainer:338) INFO: 3epoch results: [train] iter_time=3.895e-04, forward_time=0.204, loss_att=222.686, acc=0.799, loss=222.686, backward_time=0.299, grad_norm=95.822, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.046e-04, train_time=2.753, time=2 hours, 9 minutes and 35.77 seconds, total_count=33816, gpu_max_cached_mem_GB=30.271, [valid] loss_att=58.056, acc=0.888, cer=0.133, wer=0.414, loss=58.056, time=5 minutes and 6.81 seconds, total_count=168, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 28.96 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:44:49,361 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:44:49,363 (trainer:272) INFO: 4/60epoch started. Estimated time to finish: 5 days, 13 hours and 16 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-21 23:54:05,715 (trainer:732) INFO: 4epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=185.692, acc=0.831, loss=185.692, backward_time=0.296, grad_norm=93.605, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.525e-04, train_time=3.960 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:00:24,734 (trainer:732) INFO: 4epoch:train:564-1126batch: iter_time=3.213e-04, forward_time=0.205, loss_att=185.419, acc=0.834, loss=185.419, backward_time=0.300, grad_norm=103.455, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.666e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:06:43,222 (trainer:732) INFO: 4epoch:train:1127-1689batch: iter_time=3.158e-04, forward_time=0.204, loss_att=179.620, acc=0.837, loss=179.620, backward_time=0.299, grad_norm=100.315, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.807e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:13:00,555 (trainer:732) INFO: 4epoch:train:1690-2252batch: iter_time=3.108e-04, forward_time=0.204, loss_att=178.533, acc=0.838, loss=178.533, backward_time=0.297, grad_norm=101.916, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.948e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:19:20,305 (trainer:732) INFO: 4epoch:train:2253-2815batch: iter_time=3.162e-04, forward_time=0.205, loss_att=178.868, acc=0.841, loss=178.868, backward_time=0.300, grad_norm=107.042, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.088e-04, train_time=2.699 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:25:37,744 (trainer:732) INFO: 4epoch:train:2816-3378batch: iter_time=3.121e-04, forward_time=0.204, loss_att=173.029, acc=0.844, loss=173.029, backward_time=0.298, grad_norm=103.218, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.229e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:31:56,616 (trainer:732) INFO: 4epoch:train:3379-3941batch: iter_time=3.151e-04, forward_time=0.205, loss_att=176.368, acc=0.843, loss=176.368, backward_time=0.299, grad_norm=105.706, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.370e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:38:12,463 (trainer:732) INFO: 4epoch:train:3942-4504batch: iter_time=3.136e-04, forward_time=0.203, loss_att=168.582, acc=0.846, loss=168.582, backward_time=0.296, grad_norm=100.027, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.511e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:44:33,220 (trainer:732) INFO: 4epoch:train:4505-5067batch: iter_time=3.144e-04, forward_time=0.205, loss_att=171.880, acc=0.848, loss=171.880, backward_time=0.300, grad_norm=107.728, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.651e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:50:53,829 (trainer:732) INFO: 4epoch:train:5068-5630batch: iter_time=3.180e-04, forward_time=0.205, loss_att=171.321, acc=0.850, loss=171.321, backward_time=0.300, grad_norm=103.946, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.792e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 00:57:12,989 (trainer:732) INFO: 4epoch:train:5631-6193batch: iter_time=3.116e-04, forward_time=0.205, loss_att=166.828, acc=0.851, loss=166.828, backward_time=0.299, grad_norm=107.215, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.933e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:03:31,425 (trainer:732) INFO: 4epoch:train:6194-6756batch: iter_time=3.157e-04, forward_time=0.204, loss_att=167.930, acc=0.850, loss=167.930, backward_time=0.298, grad_norm=101.290, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:09:49,561 (trainer:732) INFO: 4epoch:train:6757-7319batch: iter_time=3.172e-04, forward_time=0.204, loss_att=165.632, acc=0.852, loss=165.632, backward_time=0.299, grad_norm=108.167, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:16:07,883 (trainer:732) INFO: 4epoch:train:7320-7882batch: iter_time=3.151e-04, forward_time=0.205, loss_att=163.767, acc=0.853, loss=163.767, backward_time=0.299, grad_norm=103.070, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:22:24,312 (trainer:732) INFO: 4epoch:train:7883-8445batch: iter_time=3.214e-04, forward_time=0.203, loss_att=158.855, acc=0.854, loss=158.855, backward_time=0.297, grad_norm=103.986, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:28:41,761 (trainer:732) INFO: 4epoch:train:8446-9008batch: iter_time=3.125e-04, forward_time=0.204, loss_att=163.004, acc=0.856, loss=163.004, backward_time=0.298, grad_norm=103.197, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:34:59,609 (trainer:732) INFO: 4epoch:train:9009-9571batch: iter_time=3.063e-04, forward_time=0.204, loss_att=158.392, acc=0.858, loss=158.392, backward_time=0.299, grad_norm=104.899, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:41:16,743 (trainer:732) INFO: 4epoch:train:9572-10134batch: iter_time=3.086e-04, forward_time=0.204, loss_att=158.593, acc=0.857, loss=158.593, backward_time=0.297, grad_norm=106.680, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:47:36,316 (trainer:732) INFO: 4epoch:train:10135-10697batch: iter_time=3.130e-04, forward_time=0.205, loss_att=157.249, acc=0.860, loss=157.249, backward_time=0.300, grad_norm=105.734, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 01:53:54,723 (trainer:732) INFO: 4epoch:train:10698-11260batch: iter_time=3.068e-04, forward_time=0.204, loss_att=157.049, acc=0.862, loss=157.049, backward_time=0.299, grad_norm=106.182, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:03:02,444 (trainer:338) INFO: 4epoch results: [train] iter_time=4.640e-04, forward_time=0.204, loss_att=169.302, acc=0.848, loss=169.302, backward_time=0.298, grad_norm=103.853, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.864e-04, train_time=2.751, time=2 hours, 9 minutes and 29.8 seconds, total_count=45088, gpu_max_cached_mem_GB=30.271, [valid] loss_att=37.569, acc=0.929, cer=0.085, wer=0.290, loss=37.569, time=5 minutes and 15.85 seconds, total_count=224, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 27.43 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:03:06,621 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:03:06,624 (trainer:272) INFO: 5/60epoch started. Estimated time to finish: 5 days, 10 hours and 28 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:12:21,349 (trainer:732) INFO: 5epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=151.255, acc=0.864, loss=151.255, backward_time=0.298, grad_norm=106.519, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=3.948 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:18:40,537 (trainer:732) INFO: 5epoch:train:564-1126batch: iter_time=3.052e-04, forward_time=0.204, loss_att=150.547, acc=0.866, loss=150.547, backward_time=0.299, grad_norm=107.677, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:24:57,712 (trainer:732) INFO: 5epoch:train:1127-1689batch: iter_time=2.973e-04, forward_time=0.203, loss_att=149.798, acc=0.865, loss=149.798, backward_time=0.297, grad_norm=104.010, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:31:16,600 (trainer:732) INFO: 5epoch:train:1690-2252batch: iter_time=3.057e-04, forward_time=0.205, loss_att=151.521, acc=0.866, loss=151.521, backward_time=0.299, grad_norm=111.604, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:37:36,238 (trainer:732) INFO: 5epoch:train:2253-2815batch: iter_time=3.150e-04, forward_time=0.205, loss_att=151.246, acc=0.868, loss=151.246, backward_time=0.300, grad_norm=102.298, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.699 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:43:55,180 (trainer:732) INFO: 5epoch:train:2816-3378batch: iter_time=3.010e-04, forward_time=0.204, loss_att=146.667, acc=0.869, loss=146.667, backward_time=0.299, grad_norm=110.855, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:50:14,346 (trainer:732) INFO: 5epoch:train:3379-3941batch: iter_time=3.024e-04, forward_time=0.205, loss_att=145.020, acc=0.869, loss=145.020, backward_time=0.299, grad_norm=105.945, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 02:56:31,379 (trainer:732) INFO: 5epoch:train:3942-4504batch: iter_time=3.013e-04, forward_time=0.203, loss_att=143.349, acc=0.871, loss=143.349, backward_time=0.297, grad_norm=102.545, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:02:47,340 (trainer:732) INFO: 5epoch:train:4505-5067batch: iter_time=2.890e-04, forward_time=0.203, loss_att=141.402, acc=0.871, loss=141.402, backward_time=0.297, grad_norm=103.170, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:09:02,936 (trainer:732) INFO: 5epoch:train:5068-5630batch: iter_time=3.135e-04, forward_time=0.203, loss_att=141.652, acc=0.871, loss=141.652, backward_time=0.296, grad_norm=102.868, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:15:21,255 (trainer:732) INFO: 5epoch:train:5631-6193batch: iter_time=2.998e-04, forward_time=0.204, loss_att=141.666, acc=0.872, loss=141.666, backward_time=0.298, grad_norm=105.111, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:21:40,132 (trainer:732) INFO: 5epoch:train:6194-6756batch: iter_time=2.976e-04, forward_time=0.204, loss_att=140.760, acc=0.873, loss=140.760, backward_time=0.299, grad_norm=106.068, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:27:56,918 (trainer:732) INFO: 5epoch:train:6757-7319batch: iter_time=2.999e-04, forward_time=0.204, loss_att=139.739, acc=0.873, loss=139.739, backward_time=0.297, grad_norm=100.718, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:34:15,946 (trainer:732) INFO: 5epoch:train:7320-7882batch: iter_time=2.946e-04, forward_time=0.205, loss_att=140.907, acc=0.875, loss=140.907, backward_time=0.299, grad_norm=105.801, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:40:34,536 (trainer:732) INFO: 5epoch:train:7883-8445batch: iter_time=3.000e-04, forward_time=0.204, loss_att=139.993, acc=0.875, loss=139.993, backward_time=0.299, grad_norm=106.601, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:46:54,457 (trainer:732) INFO: 5epoch:train:8446-9008batch: iter_time=2.924e-04, forward_time=0.205, loss_att=139.805, acc=0.876, loss=139.805, backward_time=0.300, grad_norm=108.356, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:53:12,861 (trainer:732) INFO: 5epoch:train:9009-9571batch: iter_time=2.992e-04, forward_time=0.204, loss_att=135.477, acc=0.879, loss=135.477, backward_time=0.298, grad_norm=105.728, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 03:59:31,351 (trainer:732) INFO: 5epoch:train:9572-10134batch: iter_time=2.896e-04, forward_time=0.204, loss_att=140.189, acc=0.877, loss=140.189, backward_time=0.299, grad_norm=105.188, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:05:47,638 (trainer:732) INFO: 5epoch:train:10135-10697batch: iter_time=3.046e-04, forward_time=0.203, loss_att=135.406, acc=0.878, loss=135.406, backward_time=0.297, grad_norm=105.742, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:12:05,806 (trainer:732) INFO: 5epoch:train:10698-11260batch: iter_time=2.887e-04, forward_time=0.204, loss_att=135.573, acc=0.879, loss=135.573, backward_time=0.298, grad_norm=100.197, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:21:09,449 (trainer:338) INFO: 5epoch results: [train] iter_time=4.028e-04, forward_time=0.204, loss_att=143.092, acc=0.872, loss=143.092, backward_time=0.298, grad_norm=105.378, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.749, time=2 hours, 9 minutes and 23.67 seconds, total_count=56360, gpu_max_cached_mem_GB=30.271, [valid] loss_att=31.406, acc=0.941, cer=0.071, wer=0.249, loss=31.406, time=5 minutes and 9.51 seconds, total_count=280, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 29.64 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:21:13,760 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:21:13,763 (trainer:272) INFO: 6/60epoch started. Estimated time to finish: 5 days, 7 hours and 49 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:30:33,396 (trainer:732) INFO: 6epoch:train:1-563batch: iter_time=0.003, forward_time=0.205, loss_att=134.302, acc=0.882, loss=134.302, backward_time=0.300, grad_norm=109.339, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=3.984 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:36:52,443 (trainer:732) INFO: 6epoch:train:564-1126batch: iter_time=3.033e-04, forward_time=0.205, loss_att=131.103, acc=0.882, loss=131.103, backward_time=0.299, grad_norm=103.884, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:43:10,446 (trainer:732) INFO: 6epoch:train:1127-1689batch: iter_time=3.091e-04, forward_time=0.204, loss_att=132.498, acc=0.882, loss=132.498, backward_time=0.298, grad_norm=107.626, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:49:29,426 (trainer:732) INFO: 6epoch:train:1690-2252batch: iter_time=2.997e-04, forward_time=0.205, loss_att=129.545, acc=0.884, loss=129.545, backward_time=0.299, grad_norm=103.932, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 04:55:48,192 (trainer:732) INFO: 6epoch:train:2253-2815batch: iter_time=3.056e-04, forward_time=0.204, loss_att=132.168, acc=0.883, loss=132.168, backward_time=0.299, grad_norm=111.430, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:02:05,118 (trainer:732) INFO: 6epoch:train:2816-3378batch: iter_time=2.974e-04, forward_time=0.203, loss_att=129.619, acc=0.884, loss=129.619, backward_time=0.297, grad_norm=103.188, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:08:23,046 (trainer:732) INFO: 6epoch:train:3379-3941batch: iter_time=3.034e-04, forward_time=0.204, loss_att=126.676, acc=0.884, loss=126.676, backward_time=0.298, grad_norm=102.889, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:14:41,170 (trainer:732) INFO: 6epoch:train:3942-4504batch: iter_time=3.053e-04, forward_time=0.204, loss_att=129.207, acc=0.885, loss=129.207, backward_time=0.298, grad_norm=105.342, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:20:58,338 (trainer:732) INFO: 6epoch:train:4505-5067batch: iter_time=3.051e-04, forward_time=0.204, loss_att=125.745, acc=0.885, loss=125.745, backward_time=0.298, grad_norm=105.587, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:27:16,827 (trainer:732) INFO: 6epoch:train:5068-5630batch: iter_time=3.111e-04, forward_time=0.204, loss_att=128.542, acc=0.885, loss=128.542, backward_time=0.299, grad_norm=137.375, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:33:33,560 (trainer:732) INFO: 6epoch:train:5631-6193batch: iter_time=3.010e-04, forward_time=0.203, loss_att=125.696, acc=0.886, loss=125.696, backward_time=0.297, grad_norm=103.947, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:39:51,710 (trainer:732) INFO: 6epoch:train:6194-6756batch: iter_time=3.055e-04, forward_time=0.204, loss_att=125.716, acc=0.887, loss=125.716, backward_time=0.298, grad_norm=103.966, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:46:09,947 (trainer:732) INFO: 6epoch:train:6757-7319batch: iter_time=2.997e-04, forward_time=0.204, loss_att=124.605, acc=0.888, loss=124.605, backward_time=0.298, grad_norm=107.760, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:52:27,197 (trainer:732) INFO: 6epoch:train:7320-7882batch: iter_time=2.953e-04, forward_time=0.204, loss_att=124.589, acc=0.887, loss=124.589, backward_time=0.298, grad_norm=106.176, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 05:58:47,416 (trainer:732) INFO: 6epoch:train:7883-8445batch: iter_time=3.002e-04, forward_time=0.205, loss_att=126.851, acc=0.889, loss=126.851, backward_time=0.300, grad_norm=107.765, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.700 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:05:05,846 (trainer:732) INFO: 6epoch:train:8446-9008batch: iter_time=2.996e-04, forward_time=0.204, loss_att=125.025, acc=0.888, loss=125.025, backward_time=0.299, grad_norm=104.561, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:11:22,594 (trainer:732) INFO: 6epoch:train:9009-9571batch: iter_time=3.008e-04, forward_time=0.203, loss_att=118.226, acc=0.891, loss=118.226, backward_time=0.297, grad_norm=105.187, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:17:41,986 (trainer:732) INFO: 6epoch:train:9572-10134batch: iter_time=3.080e-04, forward_time=0.205, loss_att=126.761, acc=0.890, loss=126.761, backward_time=0.300, grad_norm=107.331, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:23:59,408 (trainer:732) INFO: 6epoch:train:10135-10697batch: iter_time=2.993e-04, forward_time=0.204, loss_att=121.671, acc=0.890, loss=121.671, backward_time=0.298, grad_norm=105.738, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:30:17,562 (trainer:732) INFO: 6epoch:train:10698-11260batch: iter_time=2.982e-04, forward_time=0.204, loss_att=121.656, acc=0.891, loss=121.656, backward_time=0.299, grad_norm=103.615, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:39:01,235 (trainer:338) INFO: 6epoch results: [train] iter_time=4.517e-04, forward_time=0.204, loss_att=126.972, acc=0.886, loss=126.972, backward_time=0.298, grad_norm=107.327, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.750, time=2 hours, 9 minutes and 25.18 seconds, total_count=67632, gpu_max_cached_mem_GB=30.271, [valid] loss_att=26.213, acc=0.951, cer=0.059, wer=0.210, loss=26.213, time=4 minutes and 52.52 seconds, total_count=336, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 29.77 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:39:05,454 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:39:05,458 (trainer:272) INFO: 7/60epoch started. Estimated time to finish: 5 days, 5 hours and 16 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:48:20,179 (trainer:732) INFO: 7epoch:train:1-563batch: iter_time=0.002, forward_time=0.205, loss_att=121.636, acc=0.892, loss=121.636, backward_time=0.299, grad_norm=105.890, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=3.949 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 06:54:37,325 (trainer:732) INFO: 7epoch:train:564-1126batch: iter_time=3.106e-04, forward_time=0.203, loss_att=116.848, acc=0.894, loss=116.848, backward_time=0.297, grad_norm=105.589, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:00:55,149 (trainer:732) INFO: 7epoch:train:1127-1689batch: iter_time=3.075e-04, forward_time=0.204, loss_att=117.410, acc=0.894, loss=117.410, backward_time=0.298, grad_norm=107.718, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:07:12,862 (trainer:732) INFO: 7epoch:train:1690-2252batch: iter_time=3.000e-04, forward_time=0.204, loss_att=117.751, acc=0.894, loss=117.751, backward_time=0.298, grad_norm=102.494, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:13:32,094 (trainer:732) INFO: 7epoch:train:2253-2815batch: iter_time=2.977e-04, forward_time=0.205, loss_att=119.132, acc=0.893, loss=119.132, backward_time=0.300, grad_norm=102.841, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:19:46,677 (trainer:732) INFO: 7epoch:train:2816-3378batch: iter_time=2.569e-04, forward_time=0.202, loss_att=118.466, acc=0.894, loss=118.466, backward_time=0.296, grad_norm=106.201, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:25:58,747 (trainer:732) INFO: 7epoch:train:3379-3941batch: iter_time=2.244e-04, forward_time=0.201, loss_att=116.738, acc=0.896, loss=116.738, backward_time=0.293, grad_norm=101.639, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:32:12,347 (trainer:732) INFO: 7epoch:train:3942-4504batch: iter_time=2.333e-04, forward_time=0.201, loss_att=118.958, acc=0.894, loss=118.958, backward_time=0.295, grad_norm=106.786, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:38:24,445 (trainer:732) INFO: 7epoch:train:4505-5067batch: iter_time=2.219e-04, forward_time=0.201, loss_att=117.522, acc=0.896, loss=117.522, backward_time=0.294, grad_norm=107.844, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.644 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:44:36,421 (trainer:732) INFO: 7epoch:train:5068-5630batch: iter_time=2.275e-04, forward_time=0.201, loss_att=114.643, acc=0.895, loss=114.643, backward_time=0.293, grad_norm=106.423, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.641 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:50:47,929 (trainer:732) INFO: 7epoch:train:5631-6193batch: iter_time=2.266e-04, forward_time=0.201, loss_att=115.339, acc=0.896, loss=115.339, backward_time=0.293, grad_norm=108.284, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.641 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 07:56:58,736 (trainer:732) INFO: 7epoch:train:6194-6756batch: iter_time=2.217e-04, forward_time=0.201, loss_att=117.127, acc=0.894, loss=117.127, backward_time=0.293, grad_norm=101.070, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.633 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:03:13,256 (trainer:732) INFO: 7epoch:train:6757-7319batch: iter_time=2.301e-04, forward_time=0.202, loss_att=118.972, acc=0.895, loss=118.972, backward_time=0.296, grad_norm=106.392, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:09:28,267 (trainer:732) INFO: 7epoch:train:7320-7882batch: iter_time=2.321e-04, forward_time=0.203, loss_att=117.685, acc=0.897, loss=117.685, backward_time=0.296, grad_norm=108.593, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:15:39,601 (trainer:732) INFO: 7epoch:train:7883-8445batch: iter_time=2.262e-04, forward_time=0.200, loss_att=114.498, acc=0.896, loss=114.498, backward_time=0.293, grad_norm=104.835, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.637 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:21:51,250 (trainer:732) INFO: 7epoch:train:8446-9008batch: iter_time=2.302e-04, forward_time=0.201, loss_att=109.918, acc=0.898, loss=109.918, backward_time=0.293, grad_norm=108.541, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.640 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:28:02,894 (trainer:732) INFO: 7epoch:train:9009-9571batch: iter_time=2.234e-04, forward_time=0.201, loss_att=114.814, acc=0.896, loss=114.814, backward_time=0.293, grad_norm=122.838, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:34:15,566 (trainer:732) INFO: 7epoch:train:9572-10134batch: iter_time=2.217e-04, forward_time=0.201, loss_att=114.947, acc=0.899, loss=114.947, backward_time=0.294, grad_norm=107.099, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.647 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:40:29,773 (trainer:732) INFO: 7epoch:train:10135-10697batch: iter_time=2.254e-04, forward_time=0.202, loss_att=114.206, acc=0.898, loss=114.206, backward_time=0.295, grad_norm=110.314, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:46:42,537 (trainer:732) INFO: 7epoch:train:10698-11260batch: iter_time=2.206e-04, forward_time=0.201, loss_att=115.182, acc=0.897, loss=115.182, backward_time=0.293, grad_norm=103.971, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.647 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:54:44,820 (trainer:338) INFO: 7epoch results: [train] iter_time=3.469e-04, forward_time=0.202, loss_att=116.575, acc=0.895, loss=116.575, backward_time=0.295, grad_norm=106.785, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.720, time=2 hours, 7 minutes and 53.26 seconds, total_count=78904, gpu_max_cached_mem_GB=30.271, [valid] loss_att=23.964, acc=0.956, cer=0.054, wer=0.194, loss=23.964, time=4 minutes and 37.92 seconds, total_count=392, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 8.17 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:54:48,309 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 08:54:48,313 (trainer:272) INFO: 8/60epoch started. Estimated time to finish: 5 days, 2 hours and 30 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:03:30,841 (trainer:732) INFO: 8epoch:train:1-563batch: iter_time=0.002, forward_time=0.201, loss_att=112.957, acc=0.899, loss=112.957, backward_time=0.293, grad_norm=103.592, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=3.719 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:09:42,718 (trainer:732) INFO: 8epoch:train:564-1126batch: iter_time=2.275e-04, forward_time=0.201, loss_att=110.855, acc=0.900, loss=110.855, backward_time=0.294, grad_norm=101.799, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:15:56,135 (trainer:732) INFO: 8epoch:train:1127-1689batch: iter_time=2.311e-04, forward_time=0.201, loss_att=110.686, acc=0.900, loss=110.686, backward_time=0.294, grad_norm=106.551, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:22:09,038 (trainer:732) INFO: 8epoch:train:1690-2252batch: iter_time=2.285e-04, forward_time=0.202, loss_att=108.700, acc=0.902, loss=108.700, backward_time=0.294, grad_norm=102.817, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.648 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:28:20,247 (trainer:732) INFO: 8epoch:train:2253-2815batch: iter_time=2.264e-04, forward_time=0.201, loss_att=107.321, acc=0.903, loss=107.321, backward_time=0.293, grad_norm=101.662, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.640 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:34:35,138 (trainer:732) INFO: 8epoch:train:2816-3378batch: iter_time=2.359e-04, forward_time=0.203, loss_att=112.191, acc=0.902, loss=112.191, backward_time=0.296, grad_norm=101.782, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:40:47,216 (trainer:732) INFO: 8epoch:train:3379-3941batch: iter_time=2.282e-04, forward_time=0.201, loss_att=107.829, acc=0.903, loss=107.829, backward_time=0.294, grad_norm=110.578, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.644 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:47:01,664 (trainer:732) INFO: 8epoch:train:3942-4504batch: iter_time=2.328e-04, forward_time=0.202, loss_att=111.409, acc=0.902, loss=111.409, backward_time=0.295, grad_norm=100.921, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:53:16,824 (trainer:732) INFO: 8epoch:train:4505-5067batch: iter_time=2.370e-04, forward_time=0.202, loss_att=106.556, acc=0.904, loss=106.556, backward_time=0.297, grad_norm=103.321, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 09:59:31,448 (trainer:732) INFO: 8epoch:train:5068-5630batch: iter_time=2.486e-04, forward_time=0.202, loss_att=106.658, acc=0.903, loss=106.658, backward_time=0.296, grad_norm=99.585, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:05:45,090 (trainer:732) INFO: 8epoch:train:5631-6193batch: iter_time=2.405e-04, forward_time=0.201, loss_att=104.093, acc=0.904, loss=104.093, backward_time=0.296, grad_norm=103.984, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:12:00,809 (trainer:732) INFO: 8epoch:train:6194-6756batch: iter_time=2.353e-04, forward_time=0.202, loss_att=107.911, acc=0.904, loss=107.911, backward_time=0.297, grad_norm=106.376, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:18:16,021 (trainer:732) INFO: 8epoch:train:6757-7319batch: iter_time=2.422e-04, forward_time=0.202, loss_att=107.027, acc=0.904, loss=107.027, backward_time=0.297, grad_norm=99.949, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:24:32,022 (trainer:732) INFO: 8epoch:train:7320-7882batch: iter_time=2.364e-04, forward_time=0.202, loss_att=107.012, acc=0.905, loss=107.012, backward_time=0.297, grad_norm=105.705, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:30:46,362 (trainer:732) INFO: 8epoch:train:7883-8445batch: iter_time=2.513e-04, forward_time=0.202, loss_att=103.688, acc=0.906, loss=103.688, backward_time=0.296, grad_norm=94.588, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:37:01,422 (trainer:732) INFO: 8epoch:train:8446-9008batch: iter_time=2.326e-04, forward_time=0.202, loss_att=106.047, acc=0.905, loss=106.047, backward_time=0.297, grad_norm=99.384, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:43:16,552 (trainer:732) INFO: 8epoch:train:9009-9571batch: iter_time=2.451e-04, forward_time=0.202, loss_att=104.308, acc=0.907, loss=104.308, backward_time=0.297, grad_norm=99.742, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:49:33,196 (trainer:732) INFO: 8epoch:train:9572-10134batch: iter_time=2.413e-04, forward_time=0.202, loss_att=105.610, acc=0.908, loss=105.610, backward_time=0.298, grad_norm=106.946, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 10:55:49,922 (trainer:732) INFO: 8epoch:train:10135-10697batch: iter_time=2.393e-04, forward_time=0.202, loss_att=102.542, acc=0.908, loss=102.542, backward_time=0.298, grad_norm=103.670, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:02:03,506 (trainer:732) INFO: 8epoch:train:10698-11260batch: iter_time=2.375e-04, forward_time=0.201, loss_att=98.191, acc=0.908, loss=98.191, backward_time=0.295, grad_norm=97.249, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:10:05,972 (trainer:338) INFO: 8epoch results: [train] iter_time=3.427e-04, forward_time=0.202, loss_att=107.040, acc=0.904, loss=107.040, backward_time=0.296, grad_norm=102.511, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.712, time=2 hours, 7 minutes and 27.59 seconds, total_count=90176, gpu_max_cached_mem_GB=30.271, [valid] loss_att=20.247, acc=0.963, cer=0.046, wer=0.166, loss=20.247, time=4 minutes and 35.95 seconds, total_count=448, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.11 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:10:09,457 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:10:09,462 (trainer:272) INFO: 9/60epoch started. Estimated time to finish: 4 days, 23 hours and 50 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:18:57,781 (trainer:732) INFO: 9epoch:train:1-563batch: iter_time=0.002, forward_time=0.201, loss_att=96.347, acc=0.911, loss=96.347, backward_time=0.295, grad_norm=104.407, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=3.761 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:25:12,161 (trainer:732) INFO: 9epoch:train:564-1126batch: iter_time=2.683e-04, forward_time=0.202, loss_att=99.348, acc=0.911, loss=99.348, backward_time=0.296, grad_norm=100.497, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:31:27,746 (trainer:732) INFO: 9epoch:train:1127-1689batch: iter_time=2.676e-04, forward_time=0.202, loss_att=98.651, acc=0.911, loss=98.651, backward_time=0.297, grad_norm=99.098, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:37:42,307 (trainer:732) INFO: 9epoch:train:1690-2252batch: iter_time=2.631e-04, forward_time=0.202, loss_att=97.515, acc=0.911, loss=97.515, backward_time=0.297, grad_norm=100.211, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:43:56,404 (trainer:732) INFO: 9epoch:train:2253-2815batch: iter_time=2.498e-04, forward_time=0.201, loss_att=97.998, acc=0.912, loss=97.998, backward_time=0.296, grad_norm=97.160, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:50:12,301 (trainer:732) INFO: 9epoch:train:2816-3378batch: iter_time=2.574e-04, forward_time=0.202, loss_att=96.538, acc=0.912, loss=96.538, backward_time=0.297, grad_norm=106.586, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 11:56:28,233 (trainer:732) INFO: 9epoch:train:3379-3941batch: iter_time=2.570e-04, forward_time=0.202, loss_att=100.525, acc=0.911, loss=100.525, backward_time=0.298, grad_norm=102.726, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:02:43,486 (trainer:732) INFO: 9epoch:train:3942-4504batch: iter_time=2.613e-04, forward_time=0.202, loss_att=99.952, acc=0.912, loss=99.952, backward_time=0.297, grad_norm=106.272, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:08:57,961 (trainer:732) INFO: 9epoch:train:4505-5067batch: iter_time=2.563e-04, forward_time=0.202, loss_att=97.303, acc=0.912, loss=97.303, backward_time=0.297, grad_norm=98.405, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:15:12,062 (trainer:732) INFO: 9epoch:train:5068-5630batch: iter_time=2.559e-04, forward_time=0.201, loss_att=95.495, acc=0.913, loss=95.495, backward_time=0.295, grad_norm=99.136, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:21:27,403 (trainer:732) INFO: 9epoch:train:5631-6193batch: iter_time=2.530e-04, forward_time=0.202, loss_att=96.456, acc=0.913, loss=96.456, backward_time=0.297, grad_norm=98.647, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:27:44,617 (trainer:732) INFO: 9epoch:train:6194-6756batch: iter_time=2.431e-04, forward_time=0.203, loss_att=96.601, acc=0.914, loss=96.601, backward_time=0.299, grad_norm=94.905, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:34:00,558 (trainer:732) INFO: 9epoch:train:6757-7319batch: iter_time=2.480e-04, forward_time=0.202, loss_att=97.571, acc=0.913, loss=97.571, backward_time=0.297, grad_norm=99.480, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:40:15,437 (trainer:732) INFO: 9epoch:train:7320-7882batch: iter_time=2.407e-04, forward_time=0.202, loss_att=95.451, acc=0.913, loss=95.451, backward_time=0.296, grad_norm=98.129, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:46:30,725 (trainer:732) INFO: 9epoch:train:7883-8445batch: iter_time=2.465e-04, forward_time=0.201, loss_att=96.039, acc=0.913, loss=96.039, backward_time=0.296, grad_norm=96.292, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:52:48,725 (trainer:732) INFO: 9epoch:train:8446-9008batch: iter_time=2.424e-04, forward_time=0.203, loss_att=95.394, acc=0.916, loss=95.394, backward_time=0.299, grad_norm=107.682, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 12:59:03,762 (trainer:732) INFO: 9epoch:train:9009-9571batch: iter_time=2.358e-04, forward_time=0.202, loss_att=97.797, acc=0.913, loss=97.797, backward_time=0.297, grad_norm=100.636, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:05:18,531 (trainer:732) INFO: 9epoch:train:9572-10134batch: iter_time=2.460e-04, forward_time=0.202, loss_att=95.332, acc=0.915, loss=95.332, backward_time=0.297, grad_norm=98.857, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:11:34,103 (trainer:732) INFO: 9epoch:train:10135-10697batch: iter_time=2.456e-04, forward_time=0.202, loss_att=93.126, acc=0.916, loss=93.126, backward_time=0.297, grad_norm=94.435, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:17:49,659 (trainer:732) INFO: 9epoch:train:10698-11260batch: iter_time=2.393e-04, forward_time=0.202, loss_att=93.476, acc=0.917, loss=93.476, backward_time=0.297, grad_norm=100.137, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:25:51,191 (trainer:338) INFO: 9epoch results: [train] iter_time=3.511e-04, forward_time=0.202, loss_att=96.837, acc=0.913, loss=96.837, backward_time=0.297, grad_norm=100.170, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.721, time=2 hours, 7 minutes and 52.99 seconds, total_count=101448, gpu_max_cached_mem_GB=30.271, [valid] loss_att=18.791, acc=0.966, cer=0.042, wer=0.153, loss=18.791, time=4 minutes and 37.88 seconds, total_count=504, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.85 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:26:01,549 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:26:01,554 (trainer:272) INFO: 10/60epoch started. Estimated time to finish: 4 days, 21 hours and 18 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:34:54,048 (trainer:732) INFO: 10epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=93.687, acc=0.917, loss=93.687, backward_time=0.298, grad_norm=101.516, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=3.790 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:41:07,640 (trainer:732) INFO: 10epoch:train:564-1126batch: iter_time=2.541e-04, forward_time=0.201, loss_att=89.616, acc=0.918, loss=89.616, backward_time=0.295, grad_norm=95.822, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:47:22,599 (trainer:732) INFO: 10epoch:train:1127-1689batch: iter_time=2.532e-04, forward_time=0.202, loss_att=90.493, acc=0.919, loss=90.493, backward_time=0.297, grad_norm=99.606, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:53:39,956 (trainer:732) INFO: 10epoch:train:1690-2252batch: iter_time=2.608e-04, forward_time=0.203, loss_att=90.693, acc=0.919, loss=90.693, backward_time=0.298, grad_norm=97.850, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 13:59:55,781 (trainer:732) INFO: 10epoch:train:2253-2815batch: iter_time=2.501e-04, forward_time=0.203, loss_att=90.136, acc=0.919, loss=90.136, backward_time=0.298, grad_norm=102.692, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:06:10,211 (trainer:732) INFO: 10epoch:train:2816-3378batch: iter_time=2.492e-04, forward_time=0.201, loss_att=89.375, acc=0.919, loss=89.375, backward_time=0.296, grad_norm=92.813, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:12:26,175 (trainer:732) INFO: 10epoch:train:3379-3941batch: iter_time=2.541e-04, forward_time=0.203, loss_att=90.545, acc=0.919, loss=90.545, backward_time=0.298, grad_norm=101.549, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:18:41,506 (trainer:732) INFO: 10epoch:train:3942-4504batch: iter_time=2.472e-04, forward_time=0.202, loss_att=90.358, acc=0.919, loss=90.358, backward_time=0.297, grad_norm=94.216, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:24:56,552 (trainer:732) INFO: 10epoch:train:4505-5067batch: iter_time=2.586e-04, forward_time=0.202, loss_att=89.578, acc=0.918, loss=89.578, backward_time=0.296, grad_norm=96.332, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:31:12,584 (trainer:732) INFO: 10epoch:train:5068-5630batch: iter_time=2.601e-04, forward_time=0.202, loss_att=89.211, acc=0.920, loss=89.211, backward_time=0.297, grad_norm=98.479, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:37:26,876 (trainer:732) INFO: 10epoch:train:5631-6193batch: iter_time=2.547e-04, forward_time=0.201, loss_att=89.741, acc=0.919, loss=89.741, backward_time=0.296, grad_norm=100.853, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:43:42,436 (trainer:732) INFO: 10epoch:train:6194-6756batch: iter_time=2.523e-04, forward_time=0.202, loss_att=88.579, acc=0.920, loss=88.579, backward_time=0.297, grad_norm=95.675, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:49:55,139 (trainer:732) INFO: 10epoch:train:6757-7319batch: iter_time=2.531e-04, forward_time=0.201, loss_att=88.013, acc=0.920, loss=88.013, backward_time=0.295, grad_norm=94.317, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.649 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 14:56:08,407 (trainer:732) INFO: 10epoch:train:7320-7882batch: iter_time=2.471e-04, forward_time=0.201, loss_att=86.268, acc=0.921, loss=86.268, backward_time=0.295, grad_norm=96.693, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:02:25,043 (trainer:732) INFO: 10epoch:train:7883-8445batch: iter_time=2.544e-04, forward_time=0.203, loss_att=89.776, acc=0.920, loss=89.776, backward_time=0.299, grad_norm=101.419, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:08:38,384 (trainer:732) INFO: 10epoch:train:8446-9008batch: iter_time=2.447e-04, forward_time=0.201, loss_att=83.343, acc=0.922, loss=83.343, backward_time=0.295, grad_norm=94.215, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:14:55,566 (trainer:732) INFO: 10epoch:train:9009-9571batch: iter_time=2.608e-04, forward_time=0.203, loss_att=89.354, acc=0.921, loss=89.354, backward_time=0.299, grad_norm=103.609, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:21:11,446 (trainer:732) INFO: 10epoch:train:9572-10134batch: iter_time=2.476e-04, forward_time=0.202, loss_att=88.801, acc=0.920, loss=88.801, backward_time=0.298, grad_norm=95.601, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:27:25,938 (trainer:732) INFO: 10epoch:train:10135-10697batch: iter_time=2.600e-04, forward_time=0.202, loss_att=86.562, acc=0.921, loss=86.562, backward_time=0.297, grad_norm=94.217, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:33:41,885 (trainer:732) INFO: 10epoch:train:10698-11260batch: iter_time=2.484e-04, forward_time=0.202, loss_att=89.740, acc=0.921, loss=89.740, backward_time=0.297, grad_norm=96.733, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:41:44,149 (trainer:338) INFO: 10epoch results: [train] iter_time=3.484e-04, forward_time=0.202, loss_att=89.161, acc=0.920, loss=89.161, backward_time=0.297, grad_norm=97.687, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.721, time=2 hours, 7 minutes and 52.78 seconds, total_count=112720, gpu_max_cached_mem_GB=30.271, [valid] loss_att=17.799, acc=0.968, cer=0.038, wer=0.142, loss=17.799, time=4 minutes and 36.91 seconds, total_count=560, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 12.91 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:41:47,932 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:41:47,938 (trainer:272) INFO: 11/60epoch started. Estimated time to finish: 4 days, 18 hours and 49 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:50:36,406 (trainer:732) INFO: 11epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=83.219, acc=0.924, loss=83.219, backward_time=0.297, grad_norm=95.347, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=3.762 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 15:56:51,998 (trainer:732) INFO: 11epoch:train:564-1126batch: iter_time=2.605e-04, forward_time=0.203, loss_att=82.726, acc=0.925, loss=82.726, backward_time=0.298, grad_norm=91.822, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:03:09,429 (trainer:732) INFO: 11epoch:train:1127-1689batch: iter_time=2.696e-04, forward_time=0.203, loss_att=86.953, acc=0.924, loss=86.953, backward_time=0.299, grad_norm=98.112, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:09:25,018 (trainer:732) INFO: 11epoch:train:1690-2252batch: iter_time=2.632e-04, forward_time=0.202, loss_att=85.287, acc=0.924, loss=85.287, backward_time=0.297, grad_norm=97.082, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:15:38,103 (trainer:732) INFO: 11epoch:train:2253-2815batch: iter_time=2.532e-04, forward_time=0.201, loss_att=83.127, acc=0.923, loss=83.127, backward_time=0.295, grad_norm=90.833, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:21:53,865 (trainer:732) INFO: 11epoch:train:2816-3378batch: iter_time=2.731e-04, forward_time=0.202, loss_att=86.901, acc=0.923, loss=86.901, backward_time=0.297, grad_norm=99.257, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:28:09,489 (trainer:732) INFO: 11epoch:train:3379-3941batch: iter_time=2.557e-04, forward_time=0.202, loss_att=83.696, acc=0.925, loss=83.696, backward_time=0.297, grad_norm=100.650, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:34:25,196 (trainer:732) INFO: 11epoch:train:3942-4504batch: iter_time=2.626e-04, forward_time=0.202, loss_att=83.113, acc=0.926, loss=83.113, backward_time=0.298, grad_norm=96.575, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:40:40,421 (trainer:732) INFO: 11epoch:train:4505-5067batch: iter_time=2.684e-04, forward_time=0.202, loss_att=82.836, acc=0.924, loss=82.836, backward_time=0.297, grad_norm=97.187, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:46:56,059 (trainer:732) INFO: 11epoch:train:5068-5630batch: iter_time=2.664e-04, forward_time=0.202, loss_att=82.463, acc=0.926, loss=82.463, backward_time=0.297, grad_norm=98.638, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:53:12,275 (trainer:732) INFO: 11epoch:train:5631-6193batch: iter_time=2.631e-04, forward_time=0.202, loss_att=83.825, acc=0.926, loss=83.825, backward_time=0.298, grad_norm=94.630, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 16:59:28,513 (trainer:732) INFO: 11epoch:train:6194-6756batch: iter_time=2.608e-04, forward_time=0.203, loss_att=84.303, acc=0.925, loss=84.303, backward_time=0.298, grad_norm=99.462, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:05:41,678 (trainer:732) INFO: 11epoch:train:6757-7319batch: iter_time=2.650e-04, forward_time=0.201, loss_att=79.007, acc=0.926, loss=79.007, backward_time=0.295, grad_norm=98.117, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:11:56,615 (trainer:732) INFO: 11epoch:train:7320-7882batch: iter_time=2.552e-04, forward_time=0.202, loss_att=82.916, acc=0.925, loss=82.916, backward_time=0.297, grad_norm=100.695, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:18:11,558 (trainer:732) INFO: 11epoch:train:7883-8445batch: iter_time=2.526e-04, forward_time=0.203, loss_att=81.654, acc=0.925, loss=81.654, backward_time=0.297, grad_norm=92.247, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:24:25,553 (trainer:732) INFO: 11epoch:train:8446-9008batch: iter_time=2.626e-04, forward_time=0.202, loss_att=83.635, acc=0.925, loss=83.635, backward_time=0.296, grad_norm=93.890, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:30:40,061 (trainer:732) INFO: 11epoch:train:9009-9571batch: iter_time=2.655e-04, forward_time=0.202, loss_att=81.930, acc=0.926, loss=81.930, backward_time=0.297, grad_norm=100.534, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:36:55,232 (trainer:732) INFO: 11epoch:train:9572-10134batch: iter_time=2.628e-04, forward_time=0.202, loss_att=82.276, acc=0.926, loss=82.276, backward_time=0.297, grad_norm=95.904, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:43:11,154 (trainer:732) INFO: 11epoch:train:10135-10697batch: iter_time=2.499e-04, forward_time=0.202, loss_att=81.478, acc=0.927, loss=81.478, backward_time=0.297, grad_norm=96.662, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:49:27,125 (trainer:732) INFO: 11epoch:train:10698-11260batch: iter_time=2.521e-04, forward_time=0.202, loss_att=83.104, acc=0.925, loss=83.104, backward_time=0.298, grad_norm=96.185, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:57:26,108 (trainer:338) INFO: 11epoch results: [train] iter_time=3.789e-04, forward_time=0.202, loss_att=83.198, acc=0.925, loss=83.198, backward_time=0.297, grad_norm=96.710, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.720, time=2 hours, 7 minutes and 51.97 seconds, total_count=123992, gpu_max_cached_mem_GB=30.271, [valid] loss_att=16.177, acc=0.972, cer=0.035, wer=0.131, loss=16.177, time=4 minutes and 33.83 seconds, total_count=616, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 12.37 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:57:29,754 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:57:29,781 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/1epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 17:57:29,781 (trainer:272) INFO: 12/60epoch started. Estimated time to finish: 4 days, 16 hours and 22 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:06:21,401 (trainer:732) INFO: 12epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=80.711, acc=0.929, loss=80.711, backward_time=0.299, grad_norm=99.316, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=3.783 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:12:35,492 (trainer:732) INFO: 12epoch:train:564-1126batch: iter_time=2.510e-04, forward_time=0.201, loss_att=78.325, acc=0.928, loss=78.325, backward_time=0.296, grad_norm=93.828, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:18:50,426 (trainer:732) INFO: 12epoch:train:1127-1689batch: iter_time=2.452e-04, forward_time=0.202, loss_att=80.891, acc=0.928, loss=80.891, backward_time=0.297, grad_norm=99.582, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:25:05,490 (trainer:732) INFO: 12epoch:train:1690-2252batch: iter_time=2.387e-04, forward_time=0.201, loss_att=79.067, acc=0.929, loss=79.067, backward_time=0.297, grad_norm=99.600, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:31:20,255 (trainer:732) INFO: 12epoch:train:2253-2815batch: iter_time=2.344e-04, forward_time=0.201, loss_att=78.535, acc=0.929, loss=78.535, backward_time=0.297, grad_norm=94.280, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:37:32,345 (trainer:732) INFO: 12epoch:train:2816-3378batch: iter_time=2.588e-04, forward_time=0.200, loss_att=77.764, acc=0.927, loss=77.764, backward_time=0.294, grad_norm=93.286, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.644 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:43:47,906 (trainer:732) INFO: 12epoch:train:3379-3941batch: iter_time=2.389e-04, forward_time=0.202, loss_att=79.283, acc=0.929, loss=79.283, backward_time=0.297, grad_norm=98.291, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:50:02,044 (trainer:732) INFO: 12epoch:train:3942-4504batch: iter_time=2.462e-04, forward_time=0.201, loss_att=78.295, acc=0.929, loss=78.295, backward_time=0.296, grad_norm=94.021, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 18:56:16,972 (trainer:732) INFO: 12epoch:train:4505-5067batch: iter_time=2.424e-04, forward_time=0.202, loss_att=79.738, acc=0.929, loss=79.738, backward_time=0.297, grad_norm=103.114, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:02:30,926 (trainer:732) INFO: 12epoch:train:5068-5630batch: iter_time=2.425e-04, forward_time=0.201, loss_att=78.305, acc=0.929, loss=78.305, backward_time=0.296, grad_norm=95.417, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:08:45,870 (trainer:732) INFO: 12epoch:train:5631-6193batch: iter_time=2.472e-04, forward_time=0.202, loss_att=78.483, acc=0.929, loss=78.483, backward_time=0.297, grad_norm=99.333, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:15:02,355 (trainer:732) INFO: 12epoch:train:6194-6756batch: iter_time=2.398e-04, forward_time=0.202, loss_att=79.630, acc=0.929, loss=79.630, backward_time=0.298, grad_norm=96.199, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:21:16,466 (trainer:732) INFO: 12epoch:train:6757-7319batch: iter_time=2.387e-04, forward_time=0.201, loss_att=78.128, acc=0.928, loss=78.128, backward_time=0.297, grad_norm=92.555, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:27:31,920 (trainer:732) INFO: 12epoch:train:7320-7882batch: iter_time=2.397e-04, forward_time=0.202, loss_att=75.950, acc=0.931, loss=75.950, backward_time=0.297, grad_norm=98.213, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:33:46,676 (trainer:732) INFO: 12epoch:train:7883-8445batch: iter_time=2.383e-04, forward_time=0.202, loss_att=76.810, acc=0.930, loss=76.810, backward_time=0.297, grad_norm=96.307, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:40:03,092 (trainer:732) INFO: 12epoch:train:8446-9008batch: iter_time=2.471e-04, forward_time=0.202, loss_att=76.821, acc=0.930, loss=76.821, backward_time=0.298, grad_norm=92.456, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:46:19,377 (trainer:732) INFO: 12epoch:train:9009-9571batch: iter_time=2.487e-04, forward_time=0.203, loss_att=78.463, acc=0.930, loss=78.463, backward_time=0.298, grad_norm=104.124, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:52:36,346 (trainer:732) INFO: 12epoch:train:9572-10134batch: iter_time=2.433e-04, forward_time=0.203, loss_att=79.938, acc=0.930, loss=79.938, backward_time=0.299, grad_norm=106.884, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 19:58:50,819 (trainer:732) INFO: 12epoch:train:10135-10697batch: iter_time=2.446e-04, forward_time=0.202, loss_att=76.945, acc=0.930, loss=76.945, backward_time=0.296, grad_norm=95.156, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:05:08,195 (trainer:732) INFO: 12epoch:train:10698-11260batch: iter_time=2.422e-04, forward_time=0.203, loss_att=78.663, acc=0.931, loss=78.663, backward_time=0.299, grad_norm=101.035, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:13:06,336 (trainer:338) INFO: 12epoch results: [train] iter_time=3.426e-04, forward_time=0.202, loss_att=78.533, acc=0.929, loss=78.533, backward_time=0.297, grad_norm=97.658, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.720, time=2 hours, 7 minutes and 51.02 seconds, total_count=135264, gpu_max_cached_mem_GB=30.271, [valid] loss_att=15.437, acc=0.973, cer=0.034, wer=0.126, loss=15.437, time=4 minutes and 34.85 seconds, total_count=672, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.68 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:13:13,400 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:13:13,408 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/2epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:13:13,408 (trainer:272) INFO: 13/60epoch started. Estimated time to finish: 4 days, 13 hours and 57 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:22:03,263 (trainer:732) INFO: 13epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=74.727, acc=0.933, loss=74.727, backward_time=0.298, grad_norm=94.485, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=3.771 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:28:18,452 (trainer:732) INFO: 13epoch:train:564-1126batch: iter_time=2.342e-04, forward_time=0.202, loss_att=76.676, acc=0.931, loss=76.676, backward_time=0.297, grad_norm=92.980, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:34:36,797 (trainer:732) INFO: 13epoch:train:1127-1689batch: iter_time=2.391e-04, forward_time=0.203, loss_att=76.697, acc=0.933, loss=76.697, backward_time=0.300, grad_norm=94.668, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:40:52,936 (trainer:732) INFO: 13epoch:train:1690-2252batch: iter_time=2.338e-04, forward_time=0.202, loss_att=75.790, acc=0.932, loss=75.790, backward_time=0.298, grad_norm=91.915, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:47:07,719 (trainer:732) INFO: 13epoch:train:2253-2815batch: iter_time=2.275e-04, forward_time=0.202, loss_att=75.088, acc=0.932, loss=75.088, backward_time=0.297, grad_norm=98.999, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:53:21,425 (trainer:732) INFO: 13epoch:train:2816-3378batch: iter_time=2.271e-04, forward_time=0.201, loss_att=74.768, acc=0.932, loss=74.768, backward_time=0.295, grad_norm=96.884, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 20:59:36,984 (trainer:732) INFO: 13epoch:train:3379-3941batch: iter_time=2.450e-04, forward_time=0.202, loss_att=74.795, acc=0.932, loss=74.795, backward_time=0.297, grad_norm=94.078, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:05:55,492 (trainer:732) INFO: 13epoch:train:3942-4504batch: iter_time=2.319e-04, forward_time=0.203, loss_att=76.913, acc=0.932, loss=76.913, backward_time=0.299, grad_norm=95.516, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:12:11,960 (trainer:732) INFO: 13epoch:train:4505-5067batch: iter_time=2.225e-04, forward_time=0.203, loss_att=76.207, acc=0.932, loss=76.207, backward_time=0.298, grad_norm=96.011, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.002, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:18:27,390 (trainer:732) INFO: 13epoch:train:5068-5630batch: iter_time=2.337e-04, forward_time=0.202, loss_att=75.445, acc=0.932, loss=75.445, backward_time=0.297, grad_norm=97.832, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:24:43,348 (trainer:732) INFO: 13epoch:train:5631-6193batch: iter_time=2.432e-04, forward_time=0.202, loss_att=74.913, acc=0.932, loss=74.913, backward_time=0.297, grad_norm=97.799, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:30:57,598 (trainer:732) INFO: 13epoch:train:6194-6756batch: iter_time=2.304e-04, forward_time=0.201, loss_att=74.947, acc=0.932, loss=74.947, backward_time=0.296, grad_norm=105.455, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:37:12,872 (trainer:732) INFO: 13epoch:train:6757-7319batch: iter_time=2.353e-04, forward_time=0.202, loss_att=75.185, acc=0.933, loss=75.185, backward_time=0.297, grad_norm=90.713, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:43:28,273 (trainer:732) INFO: 13epoch:train:7320-7882batch: iter_time=2.307e-04, forward_time=0.202, loss_att=75.971, acc=0.933, loss=75.971, backward_time=0.297, grad_norm=100.929, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:49:41,350 (trainer:732) INFO: 13epoch:train:7883-8445batch: iter_time=2.358e-04, forward_time=0.200, loss_att=72.556, acc=0.933, loss=72.556, backward_time=0.295, grad_norm=94.126, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 21:55:55,787 (trainer:732) INFO: 13epoch:train:8446-9008batch: iter_time=2.365e-04, forward_time=0.201, loss_att=73.435, acc=0.933, loss=73.435, backward_time=0.296, grad_norm=92.891, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:02:10,415 (trainer:732) INFO: 13epoch:train:9009-9571batch: iter_time=2.305e-04, forward_time=0.202, loss_att=74.090, acc=0.932, loss=74.090, backward_time=0.297, grad_norm=90.824, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:08:22,681 (trainer:732) INFO: 13epoch:train:9572-10134batch: iter_time=2.368e-04, forward_time=0.200, loss_att=71.545, acc=0.933, loss=71.545, backward_time=0.294, grad_norm=88.179, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.644 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:14:37,423 (trainer:732) INFO: 13epoch:train:10135-10697batch: iter_time=2.394e-04, forward_time=0.202, loss_att=73.091, acc=0.933, loss=73.091, backward_time=0.296, grad_norm=93.967, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:20:52,833 (trainer:732) INFO: 13epoch:train:10698-11260batch: iter_time=2.304e-04, forward_time=0.202, loss_att=73.536, acc=0.934, loss=73.536, backward_time=0.297, grad_norm=96.403, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:29:06,297 (trainer:338) INFO: 13epoch results: [train] iter_time=3.225e-04, forward_time=0.202, loss_att=74.797, acc=0.932, loss=74.797, backward_time=0.297, grad_norm=95.232, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.721, time=2 hours, 7 minutes and 53.09 seconds, total_count=146536, gpu_max_cached_mem_GB=30.271, [valid] loss_att=14.830, acc=0.974, cer=0.031, wer=0.118, loss=14.830, time=4 minutes and 48.15 seconds, total_count=728, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 11.64 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:29:10,971 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:29:10,998 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/3epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:29:10,998 (trainer:272) INFO: 14/60epoch started. Estimated time to finish: 4 days, 11 hours and 34 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:38:01,684 (trainer:732) INFO: 14epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=70.984, acc=0.936, loss=70.984, backward_time=0.297, grad_norm=92.832, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.777 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:44:16,947 (trainer:732) INFO: 14epoch:train:564-1126batch: iter_time=2.635e-04, forward_time=0.202, loss_att=70.835, acc=0.936, loss=70.835, backward_time=0.297, grad_norm=90.214, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:50:33,438 (trainer:732) INFO: 14epoch:train:1127-1689batch: iter_time=2.604e-04, forward_time=0.203, loss_att=72.203, acc=0.935, loss=72.203, backward_time=0.298, grad_norm=92.761, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 22:56:50,154 (trainer:732) INFO: 14epoch:train:1690-2252batch: iter_time=2.515e-04, forward_time=0.203, loss_att=71.583, acc=0.935, loss=71.583, backward_time=0.298, grad_norm=95.170, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:03:08,973 (trainer:732) INFO: 14epoch:train:2253-2815batch: iter_time=2.574e-04, forward_time=0.203, loss_att=74.342, acc=0.935, loss=74.342, backward_time=0.299, grad_norm=92.637, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:09:23,995 (trainer:732) INFO: 14epoch:train:2816-3378batch: iter_time=2.612e-04, forward_time=0.202, loss_att=71.479, acc=0.935, loss=71.479, backward_time=0.296, grad_norm=96.453, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:15:39,212 (trainer:732) INFO: 14epoch:train:3379-3941batch: iter_time=2.515e-04, forward_time=0.202, loss_att=71.101, acc=0.935, loss=71.101, backward_time=0.297, grad_norm=93.227, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:21:54,599 (trainer:732) INFO: 14epoch:train:3942-4504batch: iter_time=2.576e-04, forward_time=0.202, loss_att=71.910, acc=0.935, loss=71.910, backward_time=0.297, grad_norm=91.447, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:28:07,155 (trainer:732) INFO: 14epoch:train:4505-5067batch: iter_time=2.507e-04, forward_time=0.200, loss_att=70.896, acc=0.935, loss=70.896, backward_time=0.294, grad_norm=92.021, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=0.001, train_time=2.648 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:34:22,710 (trainer:732) INFO: 14epoch:train:5068-5630batch: iter_time=2.541e-04, forward_time=0.202, loss_att=72.465, acc=0.934, loss=72.465, backward_time=0.297, grad_norm=101.609, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:40:37,422 (trainer:732) INFO: 14epoch:train:5631-6193batch: iter_time=2.543e-04, forward_time=0.202, loss_att=71.300, acc=0.934, loss=71.300, backward_time=0.296, grad_norm=94.356, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:46:53,548 (trainer:732) INFO: 14epoch:train:6194-6756batch: iter_time=2.558e-04, forward_time=0.202, loss_att=74.154, acc=0.935, loss=74.154, backward_time=0.297, grad_norm=92.439, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:53:06,517 (trainer:732) INFO: 14epoch:train:6757-7319batch: iter_time=2.626e-04, forward_time=0.201, loss_att=68.922, acc=0.936, loss=68.922, backward_time=0.295, grad_norm=89.015, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-22 23:59:23,404 (trainer:732) INFO: 14epoch:train:7320-7882batch: iter_time=2.595e-04, forward_time=0.202, loss_att=70.408, acc=0.936, loss=70.408, backward_time=0.298, grad_norm=92.128, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:05:39,735 (trainer:732) INFO: 14epoch:train:7883-8445batch: iter_time=2.638e-04, forward_time=0.202, loss_att=72.078, acc=0.935, loss=72.078, backward_time=0.298, grad_norm=94.355, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:11:53,558 (trainer:732) INFO: 14epoch:train:8446-9008batch: iter_time=2.594e-04, forward_time=0.201, loss_att=69.910, acc=0.935, loss=69.910, backward_time=0.296, grad_norm=92.018, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:18:07,352 (trainer:732) INFO: 14epoch:train:9009-9571batch: iter_time=2.555e-04, forward_time=0.201, loss_att=69.548, acc=0.936, loss=69.548, backward_time=0.296, grad_norm=95.353, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:24:24,229 (trainer:732) INFO: 14epoch:train:9572-10134batch: iter_time=2.572e-04, forward_time=0.203, loss_att=70.593, acc=0.937, loss=70.593, backward_time=0.298, grad_norm=96.508, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:30:39,524 (trainer:732) INFO: 14epoch:train:10135-10697batch: iter_time=2.524e-04, forward_time=0.202, loss_att=70.397, acc=0.936, loss=70.397, backward_time=0.297, grad_norm=97.154, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:36:56,697 (trainer:732) INFO: 14epoch:train:10698-11260batch: iter_time=2.553e-04, forward_time=0.203, loss_att=72.707, acc=0.935, loss=72.707, backward_time=0.299, grad_norm=92.237, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:45:11,858 (trainer:338) INFO: 14epoch results: [train] iter_time=3.253e-04, forward_time=0.202, loss_att=71.374, acc=0.935, loss=71.374, backward_time=0.297, grad_norm=93.876, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.723, time=2 hours, 7 minutes and 58.86 seconds, total_count=157808, gpu_max_cached_mem_GB=30.271, [valid] loss_att=14.536, acc=0.975, cer=0.031, wer=0.116, loss=14.536, time=4 minutes and 50.36 seconds, total_count=784, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 11.64 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:45:15,940 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:45:15,967 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/4epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:45:15,968 (trainer:272) INFO: 15/60epoch started. Estimated time to finish: 4 days, 9 hours and 12 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 00:54:06,453 (trainer:732) INFO: 15epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=67.628, acc=0.938, loss=67.628, backward_time=0.296, grad_norm=94.311, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.776 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:00:23,276 (trainer:732) INFO: 15epoch:train:564-1126batch: iter_time=2.703e-04, forward_time=0.203, loss_att=70.508, acc=0.937, loss=70.508, backward_time=0.298, grad_norm=89.896, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:06:39,589 (trainer:732) INFO: 15epoch:train:1127-1689batch: iter_time=2.717e-04, forward_time=0.203, loss_att=69.910, acc=0.937, loss=69.910, backward_time=0.298, grad_norm=92.173, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:12:57,010 (trainer:732) INFO: 15epoch:train:1690-2252batch: iter_time=2.648e-04, forward_time=0.203, loss_att=71.301, acc=0.937, loss=71.301, backward_time=0.299, grad_norm=94.393, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:19:11,601 (trainer:732) INFO: 15epoch:train:2253-2815batch: iter_time=2.589e-04, forward_time=0.201, loss_att=68.585, acc=0.938, loss=68.585, backward_time=0.296, grad_norm=92.304, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:25:27,633 (trainer:732) INFO: 15epoch:train:2816-3378batch: iter_time=2.615e-04, forward_time=0.202, loss_att=68.950, acc=0.937, loss=68.950, backward_time=0.298, grad_norm=91.437, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:31:41,426 (trainer:732) INFO: 15epoch:train:3379-3941batch: iter_time=2.636e-04, forward_time=0.201, loss_att=67.401, acc=0.938, loss=67.401, backward_time=0.295, grad_norm=90.618, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:37:59,593 (trainer:732) INFO: 15epoch:train:3942-4504batch: iter_time=2.534e-04, forward_time=0.203, loss_att=71.278, acc=0.937, loss=71.278, backward_time=0.299, grad_norm=100.413, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:44:16,484 (trainer:732) INFO: 15epoch:train:4505-5067batch: iter_time=2.596e-04, forward_time=0.203, loss_att=68.276, acc=0.939, loss=68.276, backward_time=0.299, grad_norm=88.695, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:50:30,996 (trainer:732) INFO: 15epoch:train:5068-5630batch: iter_time=2.663e-04, forward_time=0.201, loss_att=67.802, acc=0.938, loss=67.802, backward_time=0.296, grad_norm=91.595, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 01:56:48,935 (trainer:732) INFO: 15epoch:train:5631-6193batch: iter_time=2.617e-04, forward_time=0.203, loss_att=69.004, acc=0.938, loss=69.004, backward_time=0.299, grad_norm=89.356, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:03:04,090 (trainer:732) INFO: 15epoch:train:6194-6756batch: iter_time=2.644e-04, forward_time=0.202, loss_att=67.114, acc=0.938, loss=67.114, backward_time=0.296, grad_norm=98.020, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:09:18,584 (trainer:732) INFO: 15epoch:train:6757-7319batch: iter_time=2.642e-04, forward_time=0.202, loss_att=66.840, acc=0.939, loss=66.840, backward_time=0.297, grad_norm=91.898, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:15:33,672 (trainer:732) INFO: 15epoch:train:7320-7882batch: iter_time=2.659e-04, forward_time=0.202, loss_att=67.311, acc=0.939, loss=67.311, backward_time=0.297, grad_norm=91.450, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:21:47,366 (trainer:732) INFO: 15epoch:train:7883-8445batch: iter_time=2.599e-04, forward_time=0.201, loss_att=67.519, acc=0.938, loss=67.519, backward_time=0.296, grad_norm=89.528, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:28:01,945 (trainer:732) INFO: 15epoch:train:8446-9008batch: iter_time=2.538e-04, forward_time=0.201, loss_att=68.290, acc=0.938, loss=68.290, backward_time=0.296, grad_norm=94.639, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:34:15,071 (trainer:732) INFO: 15epoch:train:9009-9571batch: iter_time=2.638e-04, forward_time=0.201, loss_att=67.389, acc=0.938, loss=67.389, backward_time=0.295, grad_norm=92.944, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:40:29,474 (trainer:732) INFO: 15epoch:train:9572-10134batch: iter_time=2.633e-04, forward_time=0.201, loss_att=68.073, acc=0.938, loss=68.073, backward_time=0.296, grad_norm=92.884, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:46:47,115 (trainer:732) INFO: 15epoch:train:10135-10697batch: iter_time=2.610e-04, forward_time=0.203, loss_att=70.947, acc=0.938, loss=70.947, backward_time=0.299, grad_norm=94.089, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 02:53:02,173 (trainer:732) INFO: 15epoch:train:10698-11260batch: iter_time=2.597e-04, forward_time=0.201, loss_att=68.052, acc=0.938, loss=68.052, backward_time=0.296, grad_norm=93.344, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:01:20,324 (trainer:338) INFO: 15epoch results: [train] iter_time=3.859e-04, forward_time=0.202, loss_att=68.592, acc=0.938, loss=68.592, backward_time=0.297, grad_norm=92.695, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.723, time=2 hours, 7 minutes and 59.21 seconds, total_count=169080, gpu_max_cached_mem_GB=30.271, [valid] loss_att=13.725, acc=0.976, cer=0.029, wer=0.110, loss=13.725, time=4 minutes and 52.19 seconds, total_count=840, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 12.95 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:01:24,494 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:01:24,521 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/5epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:01:24,521 (trainer:272) INFO: 16/60epoch started. Estimated time to finish: 4 days, 6 hours and 52 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:10:15,353 (trainer:732) INFO: 16epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=66.518, acc=0.941, loss=66.518, backward_time=0.298, grad_norm=96.375, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.777 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:16:32,173 (trainer:732) INFO: 16epoch:train:564-1126batch: iter_time=2.391e-04, forward_time=0.202, loss_att=66.847, acc=0.941, loss=66.847, backward_time=0.298, grad_norm=91.824, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:22:47,892 (trainer:732) INFO: 16epoch:train:1127-1689batch: iter_time=2.465e-04, forward_time=0.202, loss_att=66.220, acc=0.940, loss=66.220, backward_time=0.297, grad_norm=91.583, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:29:02,186 (trainer:732) INFO: 16epoch:train:1690-2252batch: iter_time=2.448e-04, forward_time=0.201, loss_att=66.423, acc=0.939, loss=66.423, backward_time=0.296, grad_norm=92.395, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:35:17,029 (trainer:732) INFO: 16epoch:train:2253-2815batch: iter_time=2.372e-04, forward_time=0.202, loss_att=66.783, acc=0.939, loss=66.783, backward_time=0.297, grad_norm=86.430, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:41:31,640 (trainer:732) INFO: 16epoch:train:2816-3378batch: iter_time=2.444e-04, forward_time=0.202, loss_att=67.100, acc=0.939, loss=67.100, backward_time=0.297, grad_norm=91.333, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:47:46,587 (trainer:732) INFO: 16epoch:train:3379-3941batch: iter_time=2.487e-04, forward_time=0.202, loss_att=66.834, acc=0.940, loss=66.834, backward_time=0.297, grad_norm=91.533, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 03:54:01,740 (trainer:732) INFO: 16epoch:train:3942-4504batch: iter_time=2.399e-04, forward_time=0.202, loss_att=67.079, acc=0.939, loss=67.079, backward_time=0.297, grad_norm=92.336, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:00:17,815 (trainer:732) INFO: 16epoch:train:4505-5067batch: iter_time=2.326e-04, forward_time=0.202, loss_att=65.138, acc=0.941, loss=65.138, backward_time=0.298, grad_norm=91.279, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:06:33,142 (trainer:732) INFO: 16epoch:train:5068-5630batch: iter_time=2.367e-04, forward_time=0.201, loss_att=64.632, acc=0.941, loss=64.632, backward_time=0.297, grad_norm=86.660, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:12:45,162 (trainer:732) INFO: 16epoch:train:5631-6193batch: iter_time=2.346e-04, forward_time=0.200, loss_att=63.332, acc=0.940, loss=63.332, backward_time=0.293, grad_norm=89.422, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:19:00,418 (trainer:732) INFO: 16epoch:train:6194-6756batch: iter_time=2.394e-04, forward_time=0.202, loss_att=67.297, acc=0.940, loss=67.297, backward_time=0.296, grad_norm=94.170, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:25:15,951 (trainer:732) INFO: 16epoch:train:6757-7319batch: iter_time=2.381e-04, forward_time=0.202, loss_att=67.954, acc=0.940, loss=67.954, backward_time=0.297, grad_norm=93.743, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:31:29,019 (trainer:732) INFO: 16epoch:train:7320-7882batch: iter_time=2.370e-04, forward_time=0.201, loss_att=64.904, acc=0.940, loss=64.904, backward_time=0.295, grad_norm=84.222, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:37:47,158 (trainer:732) INFO: 16epoch:train:7883-8445batch: iter_time=2.366e-04, forward_time=0.203, loss_att=67.138, acc=0.941, loss=67.138, backward_time=0.300, grad_norm=88.122, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:44:04,113 (trainer:732) INFO: 16epoch:train:8446-9008batch: iter_time=2.405e-04, forward_time=0.203, loss_att=65.219, acc=0.941, loss=65.219, backward_time=0.298, grad_norm=92.438, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:50:18,717 (trainer:732) INFO: 16epoch:train:9009-9571batch: iter_time=2.303e-04, forward_time=0.202, loss_att=65.139, acc=0.940, loss=65.139, backward_time=0.296, grad_norm=95.537, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 04:56:35,137 (trainer:732) INFO: 16epoch:train:9572-10134batch: iter_time=2.325e-04, forward_time=0.203, loss_att=66.929, acc=0.939, loss=66.929, backward_time=0.298, grad_norm=94.237, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:02:50,640 (trainer:732) INFO: 16epoch:train:10135-10697batch: iter_time=2.324e-04, forward_time=0.202, loss_att=66.479, acc=0.940, loss=66.479, backward_time=0.297, grad_norm=90.295, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:09:06,085 (trainer:732) INFO: 16epoch:train:10698-11260batch: iter_time=2.347e-04, forward_time=0.202, loss_att=64.678, acc=0.942, loss=64.678, backward_time=0.297, grad_norm=94.186, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:17:27,970 (trainer:338) INFO: 16epoch results: [train] iter_time=3.246e-04, forward_time=0.202, loss_att=66.119, acc=0.940, loss=66.119, backward_time=0.297, grad_norm=91.416, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 55.33 seconds, total_count=180352, gpu_max_cached_mem_GB=30.271, [valid] loss_att=13.988, acc=0.975, cer=0.029, wer=0.109, loss=13.988, time=4 minutes and 56.06 seconds, total_count=896, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 12.06 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:17:31,703 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:17:31,713 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/6epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:17:31,713 (trainer:272) INFO: 17/60epoch started. Estimated time to finish: 4 days, 4 hours and 32 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:26:22,375 (trainer:732) INFO: 17epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=64.362, acc=0.942, loss=64.362, backward_time=0.298, grad_norm=89.921, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=0.001, train_time=3.777 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:32:37,542 (trainer:732) INFO: 17epoch:train:564-1126batch: iter_time=2.620e-04, forward_time=0.202, loss_att=63.967, acc=0.942, loss=63.967, backward_time=0.296, grad_norm=91.037, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:38:53,514 (trainer:732) INFO: 17epoch:train:1127-1689batch: iter_time=2.585e-04, forward_time=0.202, loss_att=64.394, acc=0.942, loss=64.394, backward_time=0.297, grad_norm=90.754, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:45:07,738 (trainer:732) INFO: 17epoch:train:1690-2252batch: iter_time=2.638e-04, forward_time=0.202, loss_att=62.953, acc=0.942, loss=62.953, backward_time=0.296, grad_norm=94.420, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:51:22,589 (trainer:732) INFO: 17epoch:train:2253-2815batch: iter_time=2.591e-04, forward_time=0.201, loss_att=62.956, acc=0.943, loss=62.956, backward_time=0.297, grad_norm=92.530, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 05:57:37,356 (trainer:732) INFO: 17epoch:train:2816-3378batch: iter_time=2.590e-04, forward_time=0.202, loss_att=63.549, acc=0.942, loss=63.549, backward_time=0.296, grad_norm=93.202, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:03:52,825 (trainer:732) INFO: 17epoch:train:3379-3941batch: iter_time=2.660e-04, forward_time=0.202, loss_att=64.418, acc=0.942, loss=64.418, backward_time=0.297, grad_norm=95.249, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:10:11,643 (trainer:732) INFO: 17epoch:train:3942-4504batch: iter_time=2.537e-04, forward_time=0.204, loss_att=66.065, acc=0.942, loss=66.065, backward_time=0.300, grad_norm=93.530, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:16:25,213 (trainer:732) INFO: 17epoch:train:4505-5067batch: iter_time=2.595e-04, forward_time=0.201, loss_att=63.647, acc=0.941, loss=63.647, backward_time=0.296, grad_norm=87.093, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:22:41,843 (trainer:732) INFO: 17epoch:train:5068-5630batch: iter_time=2.524e-04, forward_time=0.203, loss_att=64.517, acc=0.942, loss=64.517, backward_time=0.299, grad_norm=92.800, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:28:56,815 (trainer:732) INFO: 17epoch:train:5631-6193batch: iter_time=2.628e-04, forward_time=0.202, loss_att=63.568, acc=0.943, loss=63.568, backward_time=0.297, grad_norm=91.771, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:35:12,204 (trainer:732) INFO: 17epoch:train:6194-6756batch: iter_time=2.647e-04, forward_time=0.202, loss_att=63.833, acc=0.942, loss=63.833, backward_time=0.297, grad_norm=92.148, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:41:27,082 (trainer:732) INFO: 17epoch:train:6757-7319batch: iter_time=2.654e-04, forward_time=0.202, loss_att=63.694, acc=0.942, loss=63.694, backward_time=0.297, grad_norm=87.985, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:47:42,020 (trainer:732) INFO: 17epoch:train:7320-7882batch: iter_time=2.516e-04, forward_time=0.202, loss_att=64.679, acc=0.942, loss=64.679, backward_time=0.297, grad_norm=90.238, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 06:53:54,637 (trainer:732) INFO: 17epoch:train:7883-8445batch: iter_time=2.555e-04, forward_time=0.201, loss_att=62.104, acc=0.942, loss=62.104, backward_time=0.294, grad_norm=87.739, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.646 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:00:09,576 (trainer:732) INFO: 17epoch:train:8446-9008batch: iter_time=2.525e-04, forward_time=0.202, loss_att=63.498, acc=0.942, loss=63.498, backward_time=0.296, grad_norm=88.560, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:06:24,441 (trainer:732) INFO: 17epoch:train:9009-9571batch: iter_time=2.546e-04, forward_time=0.202, loss_att=64.686, acc=0.941, loss=64.686, backward_time=0.297, grad_norm=92.626, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:12:41,683 (trainer:732) INFO: 17epoch:train:9572-10134batch: iter_time=2.527e-04, forward_time=0.203, loss_att=64.054, acc=0.943, loss=64.054, backward_time=0.299, grad_norm=94.045, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:18:56,612 (trainer:732) INFO: 17epoch:train:10135-10697batch: iter_time=2.542e-04, forward_time=0.201, loss_att=62.264, acc=0.943, loss=62.264, backward_time=0.296, grad_norm=91.306, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:25:12,216 (trainer:732) INFO: 17epoch:train:10698-11260batch: iter_time=2.491e-04, forward_time=0.203, loss_att=64.260, acc=0.942, loss=64.260, backward_time=0.297, grad_norm=90.844, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:33:13,741 (trainer:338) INFO: 17epoch results: [train] iter_time=3.708e-04, forward_time=0.202, loss_att=63.857, acc=0.942, loss=63.857, backward_time=0.297, grad_norm=91.419, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 53.21 seconds, total_count=191624, gpu_max_cached_mem_GB=30.271, [valid] loss_att=12.866, acc=0.978, cer=0.028, wer=0.105, loss=12.866, time=4 minutes and 39.65 seconds, total_count=952, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 9.16 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:33:17,378 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:33:17,404 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/7epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:33:17,404 (trainer:272) INFO: 18/60epoch started. Estimated time to finish: 4 days, 2 hours and 11 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:42:07,746 (trainer:732) INFO: 18epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=60.373, acc=0.944, loss=60.373, backward_time=0.296, grad_norm=85.758, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.774 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:48:23,904 (trainer:732) INFO: 18epoch:train:564-1126batch: iter_time=2.485e-04, forward_time=0.203, loss_att=63.285, acc=0.944, loss=63.285, backward_time=0.298, grad_norm=86.176, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 07:54:39,964 (trainer:732) INFO: 18epoch:train:1127-1689batch: iter_time=2.697e-04, forward_time=0.202, loss_att=62.588, acc=0.944, loss=62.588, backward_time=0.297, grad_norm=91.274, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:00:54,705 (trainer:732) INFO: 18epoch:train:1690-2252batch: iter_time=2.580e-04, forward_time=0.202, loss_att=62.432, acc=0.943, loss=62.432, backward_time=0.297, grad_norm=91.468, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:07:09,321 (trainer:732) INFO: 18epoch:train:2253-2815batch: iter_time=2.583e-04, forward_time=0.202, loss_att=61.944, acc=0.944, loss=61.944, backward_time=0.296, grad_norm=88.605, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:13:23,777 (trainer:732) INFO: 18epoch:train:2816-3378batch: iter_time=2.663e-04, forward_time=0.202, loss_att=60.475, acc=0.944, loss=60.475, backward_time=0.297, grad_norm=96.252, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:19:40,025 (trainer:732) INFO: 18epoch:train:3379-3941batch: iter_time=2.581e-04, forward_time=0.203, loss_att=64.494, acc=0.943, loss=64.494, backward_time=0.298, grad_norm=91.446, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:25:55,886 (trainer:732) INFO: 18epoch:train:3942-4504batch: iter_time=2.567e-04, forward_time=0.202, loss_att=61.920, acc=0.944, loss=61.920, backward_time=0.297, grad_norm=90.921, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:32:09,897 (trainer:732) INFO: 18epoch:train:4505-5067batch: iter_time=2.588e-04, forward_time=0.201, loss_att=62.242, acc=0.943, loss=62.242, backward_time=0.296, grad_norm=100.334, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:38:27,726 (trainer:732) INFO: 18epoch:train:5068-5630batch: iter_time=2.582e-04, forward_time=0.204, loss_att=63.800, acc=0.944, loss=63.800, backward_time=0.300, grad_norm=99.700, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:44:41,545 (trainer:732) INFO: 18epoch:train:5631-6193batch: iter_time=2.532e-04, forward_time=0.201, loss_att=62.128, acc=0.944, loss=62.128, backward_time=0.296, grad_norm=89.072, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:50:56,698 (trainer:732) INFO: 18epoch:train:6194-6756batch: iter_time=2.512e-04, forward_time=0.202, loss_att=61.969, acc=0.944, loss=61.969, backward_time=0.297, grad_norm=95.054, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 08:57:11,638 (trainer:732) INFO: 18epoch:train:6757-7319batch: iter_time=2.495e-04, forward_time=0.202, loss_att=61.371, acc=0.944, loss=61.371, backward_time=0.297, grad_norm=97.388, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:03:28,077 (trainer:732) INFO: 18epoch:train:7320-7882batch: iter_time=2.450e-04, forward_time=0.202, loss_att=61.941, acc=0.945, loss=61.941, backward_time=0.298, grad_norm=92.195, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:09:42,732 (trainer:732) INFO: 18epoch:train:7883-8445batch: iter_time=2.452e-04, forward_time=0.202, loss_att=60.609, acc=0.945, loss=60.609, backward_time=0.296, grad_norm=92.990, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:15:56,519 (trainer:732) INFO: 18epoch:train:8446-9008batch: iter_time=2.429e-04, forward_time=0.201, loss_att=60.402, acc=0.944, loss=60.402, backward_time=0.296, grad_norm=88.447, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:22:11,071 (trainer:732) INFO: 18epoch:train:9009-9571batch: iter_time=2.532e-04, forward_time=0.202, loss_att=62.131, acc=0.943, loss=62.131, backward_time=0.296, grad_norm=85.572, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:28:25,948 (trainer:732) INFO: 18epoch:train:9572-10134batch: iter_time=2.568e-04, forward_time=0.202, loss_att=61.652, acc=0.944, loss=61.652, backward_time=0.297, grad_norm=89.603, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:34:41,755 (trainer:732) INFO: 18epoch:train:10135-10697batch: iter_time=2.524e-04, forward_time=0.202, loss_att=63.387, acc=0.944, loss=63.387, backward_time=0.297, grad_norm=91.607, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:40:57,721 (trainer:732) INFO: 18epoch:train:10698-11260batch: iter_time=2.456e-04, forward_time=0.202, loss_att=60.208, acc=0.944, loss=60.208, backward_time=0.298, grad_norm=92.957, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:49:04,935 (trainer:338) INFO: 18epoch results: [train] iter_time=3.585e-04, forward_time=0.202, loss_att=61.954, acc=0.944, loss=61.954, backward_time=0.297, grad_norm=91.851, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 53.44 seconds, total_count=202896, gpu_max_cached_mem_GB=30.271, [valid] loss_att=12.673, acc=0.978, cer=0.027, wer=0.102, loss=12.673, time=4 minutes and 39.38 seconds, total_count=1008, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:49:08,706 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:49:08,732 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/8epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:49:08,732 (trainer:272) INFO: 19/60epoch started. Estimated time to finish: 3 days, 23 hours and 52 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 09:57:57,628 (trainer:732) INFO: 19epoch:train:1-563batch: iter_time=0.002, forward_time=0.201, loss_att=57.857, acc=0.947, loss=57.857, backward_time=0.297, grad_norm=92.854, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.764 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:04:13,439 (trainer:732) INFO: 19epoch:train:564-1126batch: iter_time=2.408e-04, forward_time=0.203, loss_att=61.359, acc=0.945, loss=61.359, backward_time=0.298, grad_norm=98.056, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:10:30,307 (trainer:732) INFO: 19epoch:train:1127-1689batch: iter_time=2.409e-04, forward_time=0.203, loss_att=61.273, acc=0.945, loss=61.273, backward_time=0.299, grad_norm=88.206, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:16:45,656 (trainer:732) INFO: 19epoch:train:1690-2252batch: iter_time=2.392e-04, forward_time=0.202, loss_att=59.497, acc=0.945, loss=59.497, backward_time=0.297, grad_norm=91.066, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:22:58,205 (trainer:732) INFO: 19epoch:train:2253-2815batch: iter_time=2.482e-04, forward_time=0.200, loss_att=58.530, acc=0.946, loss=58.530, backward_time=0.294, grad_norm=82.616, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.647 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:29:14,202 (trainer:732) INFO: 19epoch:train:2816-3378batch: iter_time=2.393e-04, forward_time=0.202, loss_att=61.506, acc=0.945, loss=61.506, backward_time=0.297, grad_norm=86.352, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:35:32,198 (trainer:732) INFO: 19epoch:train:3379-3941batch: iter_time=2.461e-04, forward_time=0.204, loss_att=60.528, acc=0.946, loss=60.528, backward_time=0.299, grad_norm=91.065, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:41:46,194 (trainer:732) INFO: 19epoch:train:3942-4504batch: iter_time=2.373e-04, forward_time=0.201, loss_att=59.657, acc=0.945, loss=59.657, backward_time=0.296, grad_norm=88.630, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:48:01,570 (trainer:732) INFO: 19epoch:train:4505-5067batch: iter_time=2.400e-04, forward_time=0.202, loss_att=61.789, acc=0.945, loss=61.789, backward_time=0.297, grad_norm=89.152, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 10:54:15,551 (trainer:732) INFO: 19epoch:train:5068-5630batch: iter_time=2.426e-04, forward_time=0.201, loss_att=59.458, acc=0.946, loss=59.458, backward_time=0.295, grad_norm=88.930, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:00:32,313 (trainer:732) INFO: 19epoch:train:5631-6193batch: iter_time=2.374e-04, forward_time=0.203, loss_att=60.833, acc=0.945, loss=60.833, backward_time=0.298, grad_norm=97.181, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:06:48,498 (trainer:732) INFO: 19epoch:train:6194-6756batch: iter_time=2.295e-04, forward_time=0.202, loss_att=60.735, acc=0.945, loss=60.735, backward_time=0.298, grad_norm=93.272, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:13:01,802 (trainer:732) INFO: 19epoch:train:6757-7319batch: iter_time=2.342e-04, forward_time=0.201, loss_att=58.160, acc=0.947, loss=58.160, backward_time=0.296, grad_norm=86.677, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:19:18,721 (trainer:732) INFO: 19epoch:train:7320-7882batch: iter_time=2.326e-04, forward_time=0.203, loss_att=60.928, acc=0.945, loss=60.928, backward_time=0.299, grad_norm=89.536, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:25:33,754 (trainer:732) INFO: 19epoch:train:7883-8445batch: iter_time=2.410e-04, forward_time=0.202, loss_att=62.240, acc=0.944, loss=62.240, backward_time=0.297, grad_norm=89.302, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:31:51,407 (trainer:732) INFO: 19epoch:train:8446-9008batch: iter_time=2.380e-04, forward_time=0.203, loss_att=61.727, acc=0.945, loss=61.727, backward_time=0.299, grad_norm=89.506, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:38:04,373 (trainer:732) INFO: 19epoch:train:9009-9571batch: iter_time=2.389e-04, forward_time=0.201, loss_att=60.468, acc=0.945, loss=60.468, backward_time=0.296, grad_norm=86.006, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:44:18,928 (trainer:732) INFO: 19epoch:train:9572-10134batch: iter_time=2.355e-04, forward_time=0.201, loss_att=60.239, acc=0.945, loss=60.239, backward_time=0.296, grad_norm=92.337, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:50:32,784 (trainer:732) INFO: 19epoch:train:10135-10697batch: iter_time=2.368e-04, forward_time=0.201, loss_att=58.754, acc=0.945, loss=58.754, backward_time=0.296, grad_norm=82.119, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 11:56:46,843 (trainer:732) INFO: 19epoch:train:10698-11260batch: iter_time=2.387e-04, forward_time=0.201, loss_att=59.156, acc=0.946, loss=59.156, backward_time=0.296, grad_norm=88.291, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:04:48,023 (trainer:338) INFO: 19epoch results: [train] iter_time=3.331e-04, forward_time=0.202, loss_att=60.218, acc=0.945, loss=60.218, backward_time=0.297, grad_norm=89.566, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.720, time=2 hours, 7 minutes and 51.04 seconds, total_count=214168, gpu_max_cached_mem_GB=30.271, [valid] loss_att=12.405, acc=0.979, cer=0.026, wer=0.100, loss=12.405, time=4 minutes and 36.85 seconds, total_count=1064, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 11.4 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:04:51,565 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:04:51,621 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/9epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:04:51,621 (trainer:272) INFO: 20/60epoch started. Estimated time to finish: 3 days, 21 hours and 32 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:13:41,383 (trainer:732) INFO: 20epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=58.193, acc=0.947, loss=58.193, backward_time=0.296, grad_norm=87.015, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.770 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:19:56,676 (trainer:732) INFO: 20epoch:train:564-1126batch: iter_time=2.622e-04, forward_time=0.202, loss_att=58.209, acc=0.947, loss=58.209, backward_time=0.297, grad_norm=92.006, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:26:13,403 (trainer:732) INFO: 20epoch:train:1127-1689batch: iter_time=2.673e-04, forward_time=0.203, loss_att=58.342, acc=0.947, loss=58.342, backward_time=0.298, grad_norm=88.480, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:32:28,430 (trainer:732) INFO: 20epoch:train:1690-2252batch: iter_time=2.572e-04, forward_time=0.202, loss_att=56.861, acc=0.947, loss=56.861, backward_time=0.297, grad_norm=84.731, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:38:43,292 (trainer:732) INFO: 20epoch:train:2253-2815batch: iter_time=2.581e-04, forward_time=0.202, loss_att=58.590, acc=0.947, loss=58.590, backward_time=0.297, grad_norm=89.913, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:44:56,956 (trainer:732) INFO: 20epoch:train:2816-3378batch: iter_time=2.572e-04, forward_time=0.201, loss_att=58.739, acc=0.946, loss=58.739, backward_time=0.296, grad_norm=89.508, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:51:13,980 (trainer:732) INFO: 20epoch:train:3379-3941batch: iter_time=2.487e-04, forward_time=0.203, loss_att=59.386, acc=0.947, loss=59.386, backward_time=0.299, grad_norm=89.255, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 12:57:30,445 (trainer:732) INFO: 20epoch:train:3942-4504batch: iter_time=2.579e-04, forward_time=0.203, loss_att=59.118, acc=0.947, loss=59.118, backward_time=0.298, grad_norm=89.671, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:03:47,008 (trainer:732) INFO: 20epoch:train:4505-5067batch: iter_time=2.564e-04, forward_time=0.203, loss_att=58.936, acc=0.947, loss=58.936, backward_time=0.298, grad_norm=94.694, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:10:01,860 (trainer:732) INFO: 20epoch:train:5068-5630batch: iter_time=2.567e-04, forward_time=0.201, loss_att=59.399, acc=0.946, loss=59.399, backward_time=0.296, grad_norm=91.316, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:16:17,689 (trainer:732) INFO: 20epoch:train:5631-6193batch: iter_time=2.581e-04, forward_time=0.203, loss_att=58.852, acc=0.946, loss=58.852, backward_time=0.298, grad_norm=91.307, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:22:32,682 (trainer:732) INFO: 20epoch:train:6194-6756batch: iter_time=2.636e-04, forward_time=0.202, loss_att=58.649, acc=0.947, loss=58.649, backward_time=0.297, grad_norm=93.543, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:28:46,542 (trainer:732) INFO: 20epoch:train:6757-7319batch: iter_time=2.568e-04, forward_time=0.201, loss_att=58.669, acc=0.946, loss=58.669, backward_time=0.296, grad_norm=89.836, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:35:02,071 (trainer:732) INFO: 20epoch:train:7320-7882batch: iter_time=2.566e-04, forward_time=0.202, loss_att=58.732, acc=0.947, loss=58.732, backward_time=0.297, grad_norm=94.035, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:41:17,649 (trainer:732) INFO: 20epoch:train:7883-8445batch: iter_time=2.606e-04, forward_time=0.202, loss_att=58.345, acc=0.947, loss=58.345, backward_time=0.297, grad_norm=88.660, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:47:30,741 (trainer:732) INFO: 20epoch:train:8446-9008batch: iter_time=2.542e-04, forward_time=0.201, loss_att=57.210, acc=0.947, loss=57.210, backward_time=0.295, grad_norm=89.220, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 13:53:45,392 (trainer:732) INFO: 20epoch:train:9009-9571batch: iter_time=2.598e-04, forward_time=0.202, loss_att=58.943, acc=0.947, loss=58.943, backward_time=0.297, grad_norm=90.743, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:00:01,872 (trainer:732) INFO: 20epoch:train:9572-10134batch: iter_time=2.553e-04, forward_time=0.203, loss_att=59.659, acc=0.947, loss=59.659, backward_time=0.298, grad_norm=88.834, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:06:16,940 (trainer:732) INFO: 20epoch:train:10135-10697batch: iter_time=2.595e-04, forward_time=0.202, loss_att=58.840, acc=0.946, loss=58.840, backward_time=0.297, grad_norm=94.089, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:12:31,699 (trainer:732) INFO: 20epoch:train:10698-11260batch: iter_time=2.533e-04, forward_time=0.202, loss_att=57.753, acc=0.947, loss=57.753, backward_time=0.297, grad_norm=93.001, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:20:35,176 (trainer:338) INFO: 20epoch results: [train] iter_time=3.362e-04, forward_time=0.202, loss_att=58.569, acc=0.947, loss=58.569, backward_time=0.297, grad_norm=90.535, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 52.78 seconds, total_count=225440, gpu_max_cached_mem_GB=30.271, [valid] loss_att=12.073, acc=0.979, cer=0.026, wer=0.098, loss=12.073, time=4 minutes and 37.21 seconds, total_count=1120, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.56 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:20:38,828 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:20:38,856 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/10epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:20:38,856 (trainer:272) INFO: 21/60epoch started. Estimated time to finish: 3 days, 19 hours and 13 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:29:30,975 (trainer:732) INFO: 21epoch:train:1-563batch: iter_time=0.003, forward_time=0.204, loss_att=58.508, acc=0.949, loss=58.508, backward_time=0.300, grad_norm=92.656, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.787 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:35:44,224 (trainer:732) INFO: 21epoch:train:564-1126batch: iter_time=2.529e-04, forward_time=0.201, loss_att=57.344, acc=0.947, loss=57.344, backward_time=0.295, grad_norm=94.629, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:42:01,058 (trainer:732) INFO: 21epoch:train:1127-1689batch: iter_time=2.448e-04, forward_time=0.202, loss_att=55.791, acc=0.949, loss=55.791, backward_time=0.298, grad_norm=91.903, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:48:15,954 (trainer:732) INFO: 21epoch:train:1690-2252batch: iter_time=2.434e-04, forward_time=0.201, loss_att=56.809, acc=0.948, loss=56.809, backward_time=0.297, grad_norm=93.176, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 14:54:30,186 (trainer:732) INFO: 21epoch:train:2253-2815batch: iter_time=2.380e-04, forward_time=0.202, loss_att=56.559, acc=0.948, loss=56.559, backward_time=0.296, grad_norm=86.684, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:00:45,869 (trainer:732) INFO: 21epoch:train:2816-3378batch: iter_time=2.396e-04, forward_time=0.202, loss_att=57.750, acc=0.948, loss=57.750, backward_time=0.297, grad_norm=94.976, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:07:01,457 (trainer:732) INFO: 21epoch:train:3379-3941batch: iter_time=2.369e-04, forward_time=0.202, loss_att=56.458, acc=0.949, loss=56.458, backward_time=0.297, grad_norm=86.517, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:13:16,525 (trainer:732) INFO: 21epoch:train:3942-4504batch: iter_time=2.540e-04, forward_time=0.202, loss_att=58.243, acc=0.947, loss=58.243, backward_time=0.296, grad_norm=91.405, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:19:31,254 (trainer:732) INFO: 21epoch:train:4505-5067batch: iter_time=2.504e-04, forward_time=0.202, loss_att=56.195, acc=0.949, loss=56.195, backward_time=0.297, grad_norm=96.842, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:25:46,836 (trainer:732) INFO: 21epoch:train:5068-5630batch: iter_time=2.387e-04, forward_time=0.202, loss_att=58.540, acc=0.948, loss=58.540, backward_time=0.297, grad_norm=89.681, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:32:00,528 (trainer:732) INFO: 21epoch:train:5631-6193batch: iter_time=2.416e-04, forward_time=0.201, loss_att=55.215, acc=0.949, loss=55.215, backward_time=0.296, grad_norm=89.268, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:38:16,890 (trainer:732) INFO: 21epoch:train:6194-6756batch: iter_time=2.531e-04, forward_time=0.202, loss_att=58.299, acc=0.947, loss=58.299, backward_time=0.297, grad_norm=89.142, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:44:32,097 (trainer:732) INFO: 21epoch:train:6757-7319batch: iter_time=2.488e-04, forward_time=0.202, loss_att=57.660, acc=0.948, loss=57.660, backward_time=0.297, grad_norm=90.961, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:50:49,204 (trainer:732) INFO: 21epoch:train:7320-7882batch: iter_time=2.543e-04, forward_time=0.203, loss_att=58.933, acc=0.947, loss=58.933, backward_time=0.299, grad_norm=88.004, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 15:57:05,911 (trainer:732) INFO: 21epoch:train:7883-8445batch: iter_time=2.476e-04, forward_time=0.203, loss_att=57.916, acc=0.948, loss=57.916, backward_time=0.298, grad_norm=87.381, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:03:19,152 (trainer:732) INFO: 21epoch:train:8446-9008batch: iter_time=2.492e-04, forward_time=0.201, loss_att=57.147, acc=0.947, loss=57.147, backward_time=0.295, grad_norm=94.168, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:09:31,341 (trainer:732) INFO: 21epoch:train:9009-9571batch: iter_time=2.412e-04, forward_time=0.200, loss_att=56.109, acc=0.947, loss=56.109, backward_time=0.294, grad_norm=87.690, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.645 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:15:48,128 (trainer:732) INFO: 21epoch:train:9572-10134batch: iter_time=2.426e-04, forward_time=0.203, loss_att=57.765, acc=0.948, loss=57.765, backward_time=0.299, grad_norm=93.226, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:22:01,117 (trainer:732) INFO: 21epoch:train:10135-10697batch: iter_time=2.377e-04, forward_time=0.201, loss_att=55.784, acc=0.948, loss=55.784, backward_time=0.295, grad_norm=85.514, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:28:18,169 (trainer:732) INFO: 21epoch:train:10698-11260batch: iter_time=2.471e-04, forward_time=0.203, loss_att=57.593, acc=0.948, loss=57.593, backward_time=0.298, grad_norm=83.157, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:36:20,316 (trainer:338) INFO: 21epoch results: [train] iter_time=3.617e-04, forward_time=0.202, loss_att=57.217, acc=0.948, loss=57.217, backward_time=0.297, grad_norm=90.366, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 52.2 seconds, total_count=236712, gpu_max_cached_mem_GB=30.271, [valid] loss_att=12.153, acc=0.979, cer=0.026, wer=0.097, loss=12.153, time=4 minutes and 35.66 seconds, total_count=1176, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.6 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:36:23,945 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:36:23,976 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/11epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:36:23,976 (trainer:272) INFO: 22/60epoch started. Estimated time to finish: 3 days, 16 hours and 54 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:45:13,902 (trainer:732) INFO: 22epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=56.234, acc=0.949, loss=56.234, backward_time=0.297, grad_norm=89.314, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.771 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:51:29,688 (trainer:732) INFO: 22epoch:train:564-1126batch: iter_time=2.431e-04, forward_time=0.202, loss_att=56.435, acc=0.949, loss=56.435, backward_time=0.298, grad_norm=85.205, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 16:57:43,857 (trainer:732) INFO: 22epoch:train:1127-1689batch: iter_time=2.488e-04, forward_time=0.201, loss_att=54.515, acc=0.949, loss=54.515, backward_time=0.296, grad_norm=87.199, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:04:00,039 (trainer:732) INFO: 22epoch:train:1690-2252batch: iter_time=2.281e-04, forward_time=0.202, loss_att=56.302, acc=0.949, loss=56.302, backward_time=0.298, grad_norm=88.819, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:10:14,985 (trainer:732) INFO: 22epoch:train:2253-2815batch: iter_time=2.268e-04, forward_time=0.202, loss_att=55.231, acc=0.950, loss=55.231, backward_time=0.297, grad_norm=87.093, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:16:31,909 (trainer:732) INFO: 22epoch:train:2816-3378batch: iter_time=2.707e-04, forward_time=0.203, loss_att=56.922, acc=0.949, loss=56.922, backward_time=0.298, grad_norm=89.605, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:22:49,898 (trainer:732) INFO: 22epoch:train:3379-3941batch: iter_time=2.421e-04, forward_time=0.203, loss_att=58.506, acc=0.949, loss=58.506, backward_time=0.300, grad_norm=90.815, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:29:04,994 (trainer:732) INFO: 22epoch:train:3942-4504batch: iter_time=2.420e-04, forward_time=0.202, loss_att=55.931, acc=0.949, loss=55.931, backward_time=0.296, grad_norm=87.611, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:35:22,205 (trainer:732) INFO: 22epoch:train:4505-5067batch: iter_time=2.273e-04, forward_time=0.202, loss_att=58.069, acc=0.949, loss=58.069, backward_time=0.298, grad_norm=87.634, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:41:35,309 (trainer:732) INFO: 22epoch:train:5068-5630batch: iter_time=2.340e-04, forward_time=0.201, loss_att=53.815, acc=0.950, loss=53.815, backward_time=0.295, grad_norm=93.557, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:47:49,177 (trainer:732) INFO: 22epoch:train:5631-6193batch: iter_time=2.261e-04, forward_time=0.201, loss_att=55.058, acc=0.949, loss=55.058, backward_time=0.296, grad_norm=89.130, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 17:54:04,796 (trainer:732) INFO: 22epoch:train:6194-6756batch: iter_time=2.334e-04, forward_time=0.202, loss_att=56.695, acc=0.949, loss=56.695, backward_time=0.297, grad_norm=87.688, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:00:20,269 (trainer:732) INFO: 22epoch:train:6757-7319batch: iter_time=2.398e-04, forward_time=0.202, loss_att=56.745, acc=0.949, loss=56.745, backward_time=0.297, grad_norm=90.402, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:06:33,783 (trainer:732) INFO: 22epoch:train:7320-7882batch: iter_time=2.276e-04, forward_time=0.201, loss_att=55.382, acc=0.949, loss=55.382, backward_time=0.296, grad_norm=84.474, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:12:48,104 (trainer:732) INFO: 22epoch:train:7883-8445batch: iter_time=2.328e-04, forward_time=0.201, loss_att=55.155, acc=0.949, loss=55.155, backward_time=0.296, grad_norm=86.019, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:19:04,067 (trainer:732) INFO: 22epoch:train:8446-9008batch: iter_time=2.264e-04, forward_time=0.202, loss_att=57.248, acc=0.949, loss=57.248, backward_time=0.297, grad_norm=94.240, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:25:21,061 (trainer:732) INFO: 22epoch:train:9009-9571batch: iter_time=2.440e-04, forward_time=0.203, loss_att=56.392, acc=0.949, loss=56.392, backward_time=0.299, grad_norm=87.823, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:31:37,225 (trainer:732) INFO: 22epoch:train:9572-10134batch: iter_time=2.398e-04, forward_time=0.202, loss_att=55.239, acc=0.950, loss=55.239, backward_time=0.298, grad_norm=92.956, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:37:49,685 (trainer:732) INFO: 22epoch:train:10135-10697batch: iter_time=2.349e-04, forward_time=0.200, loss_att=53.725, acc=0.949, loss=53.725, backward_time=0.294, grad_norm=89.017, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.646 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:44:05,092 (trainer:732) INFO: 22epoch:train:10698-11260batch: iter_time=2.304e-04, forward_time=0.202, loss_att=54.341, acc=0.950, loss=54.341, backward_time=0.297, grad_norm=88.901, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:52:07,245 (trainer:338) INFO: 22epoch results: [train] iter_time=3.493e-04, forward_time=0.202, loss_att=55.870, acc=0.949, loss=55.870, backward_time=0.297, grad_norm=88.854, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 53.55 seconds, total_count=247984, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.735, acc=0.980, cer=0.025, wer=0.095, loss=11.735, time=4 minutes and 38.88 seconds, total_count=1232, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.84 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:52:10,888 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:52:10,898 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/12epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 18:52:10,898 (trainer:272) INFO: 23/60epoch started. Estimated time to finish: 3 days, 14 hours and 36 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:01:01,307 (trainer:732) INFO: 23epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=52.566, acc=0.952, loss=52.566, backward_time=0.298, grad_norm=90.355, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.773 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:07:17,434 (trainer:732) INFO: 23epoch:train:564-1126batch: iter_time=2.589e-04, forward_time=0.202, loss_att=54.296, acc=0.951, loss=54.296, backward_time=0.297, grad_norm=85.540, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:13:35,038 (trainer:732) INFO: 23epoch:train:1127-1689batch: iter_time=2.557e-04, forward_time=0.204, loss_att=56.957, acc=0.949, loss=56.957, backward_time=0.300, grad_norm=91.235, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:19:47,149 (trainer:732) INFO: 23epoch:train:1690-2252batch: iter_time=2.598e-04, forward_time=0.200, loss_att=53.393, acc=0.949, loss=53.393, backward_time=0.294, grad_norm=97.960, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.643 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:26:00,728 (trainer:732) INFO: 23epoch:train:2253-2815batch: iter_time=2.507e-04, forward_time=0.201, loss_att=52.868, acc=0.951, loss=52.868, backward_time=0.296, grad_norm=91.992, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:32:18,851 (trainer:732) INFO: 23epoch:train:2816-3378batch: iter_time=2.972e-04, forward_time=0.204, loss_att=55.267, acc=0.950, loss=55.267, backward_time=0.298, grad_norm=93.513, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:38:33,411 (trainer:732) INFO: 23epoch:train:3379-3941batch: iter_time=2.484e-04, forward_time=0.202, loss_att=54.413, acc=0.950, loss=54.413, backward_time=0.296, grad_norm=84.401, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:44:48,786 (trainer:732) INFO: 23epoch:train:3942-4504batch: iter_time=2.451e-04, forward_time=0.202, loss_att=55.740, acc=0.949, loss=55.740, backward_time=0.297, grad_norm=91.538, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:51:05,619 (trainer:732) INFO: 23epoch:train:4505-5067batch: iter_time=2.537e-04, forward_time=0.203, loss_att=55.093, acc=0.951, loss=55.093, backward_time=0.299, grad_norm=87.931, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 19:57:18,973 (trainer:732) INFO: 23epoch:train:5068-5630batch: iter_time=2.594e-04, forward_time=0.201, loss_att=54.223, acc=0.950, loss=54.223, backward_time=0.295, grad_norm=83.806, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:03:33,371 (trainer:732) INFO: 23epoch:train:5631-6193batch: iter_time=2.476e-04, forward_time=0.201, loss_att=53.495, acc=0.950, loss=53.495, backward_time=0.296, grad_norm=92.583, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:09:49,428 (trainer:732) INFO: 23epoch:train:6194-6756batch: iter_time=2.565e-04, forward_time=0.202, loss_att=55.228, acc=0.950, loss=55.228, backward_time=0.298, grad_norm=87.723, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:16:03,180 (trainer:732) INFO: 23epoch:train:6757-7319batch: iter_time=2.520e-04, forward_time=0.201, loss_att=54.641, acc=0.950, loss=54.641, backward_time=0.296, grad_norm=87.955, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:22:19,226 (trainer:732) INFO: 23epoch:train:7320-7882batch: iter_time=2.452e-04, forward_time=0.203, loss_att=54.999, acc=0.950, loss=54.999, backward_time=0.298, grad_norm=86.015, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:28:33,876 (trainer:732) INFO: 23epoch:train:7883-8445batch: iter_time=2.592e-04, forward_time=0.202, loss_att=56.273, acc=0.949, loss=56.273, backward_time=0.297, grad_norm=94.668, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:34:49,889 (trainer:732) INFO: 23epoch:train:8446-9008batch: iter_time=2.544e-04, forward_time=0.202, loss_att=54.545, acc=0.951, loss=54.545, backward_time=0.298, grad_norm=87.969, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:41:05,111 (trainer:732) INFO: 23epoch:train:9009-9571batch: iter_time=2.521e-04, forward_time=0.202, loss_att=54.375, acc=0.951, loss=54.375, backward_time=0.297, grad_norm=90.209, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:47:20,873 (trainer:732) INFO: 23epoch:train:9572-10134batch: iter_time=2.535e-04, forward_time=0.202, loss_att=55.682, acc=0.950, loss=55.682, backward_time=0.298, grad_norm=87.541, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:53:36,561 (trainer:732) INFO: 23epoch:train:10135-10697batch: iter_time=2.525e-04, forward_time=0.202, loss_att=55.393, acc=0.950, loss=55.393, backward_time=0.297, grad_norm=102.235, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 20:59:50,342 (trainer:732) INFO: 23epoch:train:10698-11260batch: iter_time=2.481e-04, forward_time=0.201, loss_att=54.688, acc=0.950, loss=54.688, backward_time=0.296, grad_norm=91.716, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:07:54,797 (trainer:338) INFO: 23epoch results: [train] iter_time=3.504e-04, forward_time=0.202, loss_att=54.676, acc=0.950, loss=54.676, backward_time=0.297, grad_norm=90.330, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.720, time=2 hours, 7 minutes and 51.75 seconds, total_count=259256, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.912, acc=0.980, cer=0.025, wer=0.094, loss=11.912, time=4 minutes and 39.14 seconds, total_count=1288, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:07:59,024 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:07:59,054 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/13epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:07:59,054 (trainer:272) INFO: 24/60epoch started. Estimated time to finish: 3 days, 12 hours and 17 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:16:52,172 (trainer:732) INFO: 24epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=53.752, acc=0.952, loss=53.752, backward_time=0.298, grad_norm=84.860, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.794 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:23:06,506 (trainer:732) INFO: 24epoch:train:564-1126batch: iter_time=2.357e-04, forward_time=0.202, loss_att=53.227, acc=0.952, loss=53.227, backward_time=0.296, grad_norm=89.321, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:29:20,498 (trainer:732) INFO: 24epoch:train:1127-1689batch: iter_time=2.306e-04, forward_time=0.201, loss_att=53.224, acc=0.951, loss=53.224, backward_time=0.296, grad_norm=87.021, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:35:36,523 (trainer:732) INFO: 24epoch:train:1690-2252batch: iter_time=2.307e-04, forward_time=0.202, loss_att=53.351, acc=0.951, loss=53.351, backward_time=0.298, grad_norm=85.940, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:41:51,583 (trainer:732) INFO: 24epoch:train:2253-2815batch: iter_time=2.454e-04, forward_time=0.202, loss_att=53.545, acc=0.952, loss=53.545, backward_time=0.297, grad_norm=86.902, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:48:07,053 (trainer:732) INFO: 24epoch:train:2816-3378batch: iter_time=2.464e-04, forward_time=0.202, loss_att=54.367, acc=0.951, loss=54.367, backward_time=0.298, grad_norm=90.954, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 21:54:23,546 (trainer:732) INFO: 24epoch:train:3379-3941batch: iter_time=2.434e-04, forward_time=0.203, loss_att=53.371, acc=0.952, loss=53.371, backward_time=0.298, grad_norm=87.147, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:00:39,315 (trainer:732) INFO: 24epoch:train:3942-4504batch: iter_time=2.366e-04, forward_time=0.202, loss_att=52.755, acc=0.952, loss=52.755, backward_time=0.297, grad_norm=89.032, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:06:54,025 (trainer:732) INFO: 24epoch:train:4505-5067batch: iter_time=2.323e-04, forward_time=0.201, loss_att=52.956, acc=0.951, loss=52.956, backward_time=0.296, grad_norm=88.220, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:13:09,329 (trainer:732) INFO: 24epoch:train:5068-5630batch: iter_time=2.484e-04, forward_time=0.202, loss_att=54.523, acc=0.951, loss=54.523, backward_time=0.297, grad_norm=85.618, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:19:27,156 (trainer:732) INFO: 24epoch:train:5631-6193batch: iter_time=2.404e-04, forward_time=0.204, loss_att=53.860, acc=0.951, loss=53.860, backward_time=0.299, grad_norm=89.090, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:25:42,303 (trainer:732) INFO: 24epoch:train:6194-6756batch: iter_time=2.391e-04, forward_time=0.201, loss_att=54.903, acc=0.950, loss=54.903, backward_time=0.297, grad_norm=90.391, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:31:58,026 (trainer:732) INFO: 24epoch:train:6757-7319batch: iter_time=2.421e-04, forward_time=0.203, loss_att=54.901, acc=0.951, loss=54.901, backward_time=0.298, grad_norm=84.391, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:38:10,434 (trainer:732) INFO: 24epoch:train:7320-7882batch: iter_time=2.392e-04, forward_time=0.200, loss_att=53.225, acc=0.951, loss=53.225, backward_time=0.294, grad_norm=87.915, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.647 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:44:25,647 (trainer:732) INFO: 24epoch:train:7883-8445batch: iter_time=2.366e-04, forward_time=0.201, loss_att=51.991, acc=0.952, loss=51.991, backward_time=0.296, grad_norm=86.527, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:50:41,063 (trainer:732) INFO: 24epoch:train:8446-9008batch: iter_time=2.316e-04, forward_time=0.202, loss_att=52.735, acc=0.952, loss=52.735, backward_time=0.297, grad_norm=90.328, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 22:56:56,801 (trainer:732) INFO: 24epoch:train:9009-9571batch: iter_time=2.331e-04, forward_time=0.202, loss_att=53.633, acc=0.951, loss=53.633, backward_time=0.297, grad_norm=85.836, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:03:12,607 (trainer:732) INFO: 24epoch:train:9572-10134batch: iter_time=2.359e-04, forward_time=0.202, loss_att=52.925, acc=0.952, loss=52.925, backward_time=0.297, grad_norm=89.129, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:09:27,703 (trainer:732) INFO: 24epoch:train:10135-10697batch: iter_time=2.471e-04, forward_time=0.202, loss_att=53.498, acc=0.951, loss=53.498, backward_time=0.296, grad_norm=86.677, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:15:42,093 (trainer:732) INFO: 24epoch:train:10698-11260batch: iter_time=2.333e-04, forward_time=0.201, loss_att=54.241, acc=0.951, loss=54.241, backward_time=0.296, grad_norm=87.096, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:24:01,946 (trainer:338) INFO: 24epoch results: [train] iter_time=3.375e-04, forward_time=0.202, loss_att=53.545, acc=0.951, loss=53.545, backward_time=0.297, grad_norm=87.598, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.722, time=2 hours, 7 minutes and 56.07 seconds, total_count=270528, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.738, acc=0.980, cer=0.024, wer=0.093, loss=11.738, time=4 minutes and 53.66 seconds, total_count=1344, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.16 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:24:05,676 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:24:05,705 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/14epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:24:05,706 (trainer:272) INFO: 25/60epoch started. Estimated time to finish: 3 days, 10 hours and 12.44 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:33:00,103 (trainer:732) INFO: 25epoch:train:1-563batch: iter_time=0.004, forward_time=0.204, loss_att=51.996, acc=0.953, loss=51.996, backward_time=0.299, grad_norm=95.163, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.803 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:39:17,684 (trainer:732) INFO: 25epoch:train:564-1126batch: iter_time=2.422e-04, forward_time=0.203, loss_att=52.353, acc=0.953, loss=52.353, backward_time=0.299, grad_norm=91.831, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:45:35,090 (trainer:732) INFO: 25epoch:train:1127-1689batch: iter_time=2.432e-04, forward_time=0.203, loss_att=53.639, acc=0.952, loss=53.639, backward_time=0.299, grad_norm=87.986, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:51:49,319 (trainer:732) INFO: 25epoch:train:1690-2252batch: iter_time=2.352e-04, forward_time=0.201, loss_att=52.160, acc=0.952, loss=52.160, backward_time=0.296, grad_norm=95.139, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-23 23:58:03,476 (trainer:732) INFO: 25epoch:train:2253-2815batch: iter_time=2.373e-04, forward_time=0.201, loss_att=52.970, acc=0.951, loss=52.970, backward_time=0.295, grad_norm=88.374, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:04:18,219 (trainer:732) INFO: 25epoch:train:2816-3378batch: iter_time=2.400e-04, forward_time=0.202, loss_att=51.582, acc=0.952, loss=51.582, backward_time=0.296, grad_norm=90.361, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:10:33,544 (trainer:732) INFO: 25epoch:train:3379-3941batch: iter_time=2.326e-04, forward_time=0.202, loss_att=53.399, acc=0.952, loss=53.399, backward_time=0.297, grad_norm=83.218, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:16:49,767 (trainer:732) INFO: 25epoch:train:3942-4504batch: iter_time=2.337e-04, forward_time=0.203, loss_att=51.772, acc=0.952, loss=51.772, backward_time=0.298, grad_norm=87.894, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:23:04,287 (trainer:732) INFO: 25epoch:train:4505-5067batch: iter_time=2.370e-04, forward_time=0.201, loss_att=52.971, acc=0.952, loss=52.971, backward_time=0.296, grad_norm=86.151, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:29:19,118 (trainer:732) INFO: 25epoch:train:5068-5630batch: iter_time=2.316e-04, forward_time=0.202, loss_att=51.598, acc=0.952, loss=51.598, backward_time=0.296, grad_norm=90.913, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:35:33,503 (trainer:732) INFO: 25epoch:train:5631-6193batch: iter_time=2.333e-04, forward_time=0.201, loss_att=52.617, acc=0.951, loss=52.617, backward_time=0.296, grad_norm=86.962, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:41:47,044 (trainer:732) INFO: 25epoch:train:6194-6756batch: iter_time=2.276e-04, forward_time=0.201, loss_att=53.666, acc=0.951, loss=53.666, backward_time=0.296, grad_norm=86.992, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:48:01,171 (trainer:732) INFO: 25epoch:train:6757-7319batch: iter_time=2.292e-04, forward_time=0.201, loss_att=51.844, acc=0.952, loss=51.844, backward_time=0.296, grad_norm=86.271, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 00:54:17,381 (trainer:732) INFO: 25epoch:train:7320-7882batch: iter_time=2.227e-04, forward_time=0.202, loss_att=52.065, acc=0.953, loss=52.065, backward_time=0.298, grad_norm=88.748, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:00:33,665 (trainer:732) INFO: 25epoch:train:7883-8445batch: iter_time=2.419e-04, forward_time=0.202, loss_att=52.836, acc=0.952, loss=52.836, backward_time=0.297, grad_norm=87.400, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:06:48,086 (trainer:732) INFO: 25epoch:train:8446-9008batch: iter_time=2.316e-04, forward_time=0.201, loss_att=51.272, acc=0.953, loss=51.272, backward_time=0.296, grad_norm=86.561, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:13:03,426 (trainer:732) INFO: 25epoch:train:9009-9571batch: iter_time=2.355e-04, forward_time=0.202, loss_att=52.958, acc=0.952, loss=52.958, backward_time=0.297, grad_norm=89.296, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:19:17,387 (trainer:732) INFO: 25epoch:train:9572-10134batch: iter_time=2.294e-04, forward_time=0.201, loss_att=51.261, acc=0.952, loss=51.261, backward_time=0.296, grad_norm=88.401, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:25:33,587 (trainer:732) INFO: 25epoch:train:10135-10697batch: iter_time=2.382e-04, forward_time=0.203, loss_att=54.178, acc=0.951, loss=54.178, backward_time=0.297, grad_norm=84.413, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:31:49,964 (trainer:732) INFO: 25epoch:train:10698-11260batch: iter_time=2.429e-04, forward_time=0.202, loss_att=52.585, acc=0.952, loss=52.585, backward_time=0.298, grad_norm=90.501, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:40:10,408 (trainer:338) INFO: 25epoch results: [train] iter_time=4.107e-04, forward_time=0.202, loss_att=52.477, acc=0.952, loss=52.477, backward_time=0.297, grad_norm=88.605, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.722, time=2 hours, 7 minutes and 56.99 seconds, total_count=281800, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.315, acc=0.981, cer=0.024, wer=0.091, loss=11.315, time=4 minutes and 52.8 seconds, total_count=1400, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.9 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:40:14,131 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:40:14,160 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/16epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:40:14,161 (trainer:272) INFO: 26/60epoch started. Estimated time to finish: 3 days, 7 hours and 42 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:49:04,835 (trainer:732) INFO: 26epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=50.698, acc=0.953, loss=50.698, backward_time=0.297, grad_norm=89.842, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.777 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 01:55:21,664 (trainer:732) INFO: 26epoch:train:564-1126batch: iter_time=2.520e-04, forward_time=0.203, loss_att=50.782, acc=0.954, loss=50.782, backward_time=0.299, grad_norm=86.685, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:01:35,260 (trainer:732) INFO: 26epoch:train:1127-1689batch: iter_time=2.549e-04, forward_time=0.201, loss_att=50.466, acc=0.953, loss=50.466, backward_time=0.295, grad_norm=85.601, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:07:50,184 (trainer:732) INFO: 26epoch:train:1690-2252batch: iter_time=2.509e-04, forward_time=0.202, loss_att=50.069, acc=0.954, loss=50.069, backward_time=0.296, grad_norm=86.618, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:14:05,091 (trainer:732) INFO: 26epoch:train:2253-2815batch: iter_time=2.434e-04, forward_time=0.202, loss_att=52.545, acc=0.952, loss=52.545, backward_time=0.296, grad_norm=86.948, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:20:21,167 (trainer:732) INFO: 26epoch:train:2816-3378batch: iter_time=2.487e-04, forward_time=0.202, loss_att=51.190, acc=0.953, loss=51.190, backward_time=0.298, grad_norm=86.575, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:26:38,155 (trainer:732) INFO: 26epoch:train:3379-3941batch: iter_time=2.531e-04, forward_time=0.203, loss_att=51.398, acc=0.953, loss=51.398, backward_time=0.298, grad_norm=86.526, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:32:53,634 (trainer:732) INFO: 26epoch:train:3942-4504batch: iter_time=2.538e-04, forward_time=0.202, loss_att=52.352, acc=0.952, loss=52.352, backward_time=0.297, grad_norm=86.468, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:39:07,146 (trainer:732) INFO: 26epoch:train:4505-5067batch: iter_time=2.525e-04, forward_time=0.201, loss_att=49.597, acc=0.954, loss=49.597, backward_time=0.296, grad_norm=79.972, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:45:24,034 (trainer:732) INFO: 26epoch:train:5068-5630batch: iter_time=2.553e-04, forward_time=0.203, loss_att=53.157, acc=0.953, loss=53.157, backward_time=0.298, grad_norm=89.393, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:51:38,545 (trainer:732) INFO: 26epoch:train:5631-6193batch: iter_time=2.437e-04, forward_time=0.202, loss_att=50.998, acc=0.953, loss=50.998, backward_time=0.296, grad_norm=86.273, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 02:57:56,041 (trainer:732) INFO: 26epoch:train:6194-6756batch: iter_time=2.528e-04, forward_time=0.203, loss_att=51.312, acc=0.954, loss=51.312, backward_time=0.298, grad_norm=94.708, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:04:12,829 (trainer:732) INFO: 26epoch:train:6757-7319batch: iter_time=2.401e-04, forward_time=0.203, loss_att=51.838, acc=0.953, loss=51.838, backward_time=0.298, grad_norm=89.762, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:10:25,422 (trainer:732) INFO: 26epoch:train:7320-7882batch: iter_time=2.445e-04, forward_time=0.200, loss_att=51.365, acc=0.952, loss=51.365, backward_time=0.294, grad_norm=90.804, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.648 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:16:41,575 (trainer:732) INFO: 26epoch:train:7883-8445batch: iter_time=2.511e-04, forward_time=0.202, loss_att=52.755, acc=0.953, loss=52.755, backward_time=0.297, grad_norm=89.949, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:22:56,160 (trainer:732) INFO: 26epoch:train:8446-9008batch: iter_time=2.465e-04, forward_time=0.201, loss_att=52.189, acc=0.953, loss=52.189, backward_time=0.296, grad_norm=88.694, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:29:11,828 (trainer:732) INFO: 26epoch:train:9009-9571batch: iter_time=2.642e-04, forward_time=0.202, loss_att=51.688, acc=0.953, loss=51.688, backward_time=0.297, grad_norm=84.873, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:35:26,921 (trainer:732) INFO: 26epoch:train:9572-10134batch: iter_time=2.845e-04, forward_time=0.202, loss_att=53.576, acc=0.952, loss=53.576, backward_time=0.297, grad_norm=87.641, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:41:42,011 (trainer:732) INFO: 26epoch:train:10135-10697batch: iter_time=2.513e-04, forward_time=0.202, loss_att=50.643, acc=0.953, loss=50.643, backward_time=0.297, grad_norm=90.967, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:47:59,372 (trainer:732) INFO: 26epoch:train:10698-11260batch: iter_time=2.448e-04, forward_time=0.203, loss_att=51.487, acc=0.954, loss=51.487, backward_time=0.299, grad_norm=88.630, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:56:22,456 (trainer:338) INFO: 26epoch results: [train] iter_time=3.921e-04, forward_time=0.202, loss_att=51.493, acc=0.953, loss=51.493, backward_time=0.297, grad_norm=87.845, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.723, time=2 hours, 7 minutes and 58.25 seconds, total_count=293072, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.381, acc=0.981, cer=0.024, wer=0.091, loss=11.381, time=4 minutes and 57.97 seconds, total_count=1456, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 12.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:56:26,459 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:56:26,485 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/15epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 03:56:26,485 (trainer:272) INFO: 27/60epoch started. Estimated time to finish: 3 days, 5 hours and 25 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:05:18,755 (trainer:732) INFO: 27epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=50.186, acc=0.954, loss=50.186, backward_time=0.298, grad_norm=85.806, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=3.788 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:11:33,678 (trainer:732) INFO: 27epoch:train:564-1126batch: iter_time=2.550e-04, forward_time=0.202, loss_att=49.698, acc=0.954, loss=49.698, backward_time=0.297, grad_norm=93.268, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:17:49,199 (trainer:732) INFO: 27epoch:train:1127-1689batch: iter_time=2.546e-04, forward_time=0.202, loss_att=51.509, acc=0.953, loss=51.509, backward_time=0.297, grad_norm=92.973, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:24:04,980 (trainer:732) INFO: 27epoch:train:1690-2252batch: iter_time=2.556e-04, forward_time=0.202, loss_att=52.188, acc=0.953, loss=52.188, backward_time=0.297, grad_norm=87.151, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:30:20,689 (trainer:732) INFO: 27epoch:train:2253-2815batch: iter_time=2.519e-04, forward_time=0.203, loss_att=50.927, acc=0.954, loss=50.927, backward_time=0.298, grad_norm=86.687, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:36:36,721 (trainer:732) INFO: 27epoch:train:2816-3378batch: iter_time=2.515e-04, forward_time=0.202, loss_att=49.369, acc=0.955, loss=49.369, backward_time=0.298, grad_norm=89.800, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:42:51,889 (trainer:732) INFO: 27epoch:train:3379-3941batch: iter_time=2.487e-04, forward_time=0.201, loss_att=51.227, acc=0.954, loss=51.227, backward_time=0.297, grad_norm=90.427, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:49:08,410 (trainer:732) INFO: 27epoch:train:3942-4504batch: iter_time=2.536e-04, forward_time=0.203, loss_att=50.792, acc=0.954, loss=50.792, backward_time=0.298, grad_norm=93.895, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 04:55:24,374 (trainer:732) INFO: 27epoch:train:4505-5067batch: iter_time=2.508e-04, forward_time=0.202, loss_att=50.359, acc=0.955, loss=50.359, backward_time=0.297, grad_norm=88.202, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:01:38,237 (trainer:732) INFO: 27epoch:train:5068-5630batch: iter_time=2.517e-04, forward_time=0.201, loss_att=51.130, acc=0.953, loss=51.130, backward_time=0.296, grad_norm=89.161, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:07:54,268 (trainer:732) INFO: 27epoch:train:5631-6193batch: iter_time=2.555e-04, forward_time=0.202, loss_att=50.844, acc=0.954, loss=50.844, backward_time=0.297, grad_norm=90.166, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:14:10,280 (trainer:732) INFO: 27epoch:train:6194-6756batch: iter_time=2.516e-04, forward_time=0.202, loss_att=50.558, acc=0.954, loss=50.558, backward_time=0.297, grad_norm=88.853, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:20:24,759 (trainer:732) INFO: 27epoch:train:6757-7319batch: iter_time=2.465e-04, forward_time=0.202, loss_att=50.150, acc=0.954, loss=50.150, backward_time=0.296, grad_norm=84.029, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:26:39,970 (trainer:732) INFO: 27epoch:train:7320-7882batch: iter_time=2.520e-04, forward_time=0.203, loss_att=49.522, acc=0.955, loss=49.522, backward_time=0.297, grad_norm=94.453, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:32:54,713 (trainer:732) INFO: 27epoch:train:7883-8445batch: iter_time=2.542e-04, forward_time=0.202, loss_att=50.614, acc=0.953, loss=50.614, backward_time=0.297, grad_norm=85.635, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:39:08,838 (trainer:732) INFO: 27epoch:train:8446-9008batch: iter_time=2.416e-04, forward_time=0.201, loss_att=49.884, acc=0.953, loss=49.884, backward_time=0.296, grad_norm=92.922, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:45:22,588 (trainer:732) INFO: 27epoch:train:9009-9571batch: iter_time=2.485e-04, forward_time=0.201, loss_att=51.621, acc=0.952, loss=51.621, backward_time=0.296, grad_norm=88.374, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:51:36,155 (trainer:732) INFO: 27epoch:train:9572-10134batch: iter_time=2.492e-04, forward_time=0.201, loss_att=50.473, acc=0.953, loss=50.473, backward_time=0.295, grad_norm=83.420, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 05:57:53,746 (trainer:732) INFO: 27epoch:train:10135-10697batch: iter_time=2.459e-04, forward_time=0.203, loss_att=51.309, acc=0.954, loss=51.309, backward_time=0.299, grad_norm=94.870, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:04:10,118 (trainer:732) INFO: 27epoch:train:10698-11260batch: iter_time=2.435e-04, forward_time=0.203, loss_att=49.380, acc=0.954, loss=49.380, backward_time=0.297, grad_norm=86.794, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:12:21,462 (trainer:338) INFO: 27epoch results: [train] iter_time=3.766e-04, forward_time=0.202, loss_att=50.581, acc=0.954, loss=50.581, backward_time=0.297, grad_norm=89.322, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.722, time=2 hours, 7 minutes and 56.24 seconds, total_count=304344, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.461, acc=0.981, cer=0.024, wer=0.091, loss=11.461, time=4 minutes and 47.57 seconds, total_count=1512, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 11.16 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:12:30,315 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:12:30,363 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/17epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:12:30,363 (trainer:272) INFO: 28/60epoch started. Estimated time to finish: 3 days, 3 hours and 8 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:21:20,614 (trainer:732) INFO: 28epoch:train:1-563batch: iter_time=0.002, forward_time=0.201, loss_att=48.744, acc=0.955, loss=48.744, backward_time=0.296, grad_norm=87.227, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.775 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:27:36,552 (trainer:732) INFO: 28epoch:train:564-1126batch: iter_time=2.350e-04, forward_time=0.202, loss_att=49.528, acc=0.955, loss=49.528, backward_time=0.298, grad_norm=92.306, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:33:51,598 (trainer:732) INFO: 28epoch:train:1127-1689batch: iter_time=2.453e-04, forward_time=0.202, loss_att=48.746, acc=0.955, loss=48.746, backward_time=0.297, grad_norm=91.756, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:40:05,263 (trainer:732) INFO: 28epoch:train:1690-2252batch: iter_time=2.445e-04, forward_time=0.201, loss_att=49.593, acc=0.954, loss=49.593, backward_time=0.295, grad_norm=84.505, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:46:19,053 (trainer:732) INFO: 28epoch:train:2253-2815batch: iter_time=2.479e-04, forward_time=0.202, loss_att=48.231, acc=0.955, loss=48.231, backward_time=0.296, grad_norm=87.124, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:52:35,576 (trainer:732) INFO: 28epoch:train:2816-3378batch: iter_time=2.395e-04, forward_time=0.203, loss_att=50.280, acc=0.955, loss=50.280, backward_time=0.299, grad_norm=87.834, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 06:58:53,104 (trainer:732) INFO: 28epoch:train:3379-3941batch: iter_time=2.319e-04, forward_time=0.203, loss_att=49.701, acc=0.955, loss=49.701, backward_time=0.299, grad_norm=87.797, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:05:07,851 (trainer:732) INFO: 28epoch:train:3942-4504batch: iter_time=2.367e-04, forward_time=0.201, loss_att=48.719, acc=0.955, loss=48.719, backward_time=0.296, grad_norm=89.273, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:11:23,004 (trainer:732) INFO: 28epoch:train:4505-5067batch: iter_time=2.322e-04, forward_time=0.201, loss_att=49.777, acc=0.955, loss=49.777, backward_time=0.297, grad_norm=87.603, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:17:37,030 (trainer:732) INFO: 28epoch:train:5068-5630batch: iter_time=2.415e-04, forward_time=0.201, loss_att=49.905, acc=0.954, loss=49.905, backward_time=0.295, grad_norm=90.490, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:23:53,334 (trainer:732) INFO: 28epoch:train:5631-6193batch: iter_time=2.339e-04, forward_time=0.202, loss_att=51.521, acc=0.954, loss=51.521, backward_time=0.298, grad_norm=92.819, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:30:09,732 (trainer:732) INFO: 28epoch:train:6194-6756batch: iter_time=2.317e-04, forward_time=0.203, loss_att=51.172, acc=0.954, loss=51.172, backward_time=0.298, grad_norm=82.027, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:36:22,844 (trainer:732) INFO: 28epoch:train:6757-7319batch: iter_time=2.298e-04, forward_time=0.201, loss_att=48.481, acc=0.955, loss=48.481, backward_time=0.295, grad_norm=81.177, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:42:38,643 (trainer:732) INFO: 28epoch:train:7320-7882batch: iter_time=2.431e-04, forward_time=0.202, loss_att=50.639, acc=0.953, loss=50.639, backward_time=0.298, grad_norm=87.241, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:48:54,452 (trainer:732) INFO: 28epoch:train:7883-8445batch: iter_time=2.473e-04, forward_time=0.202, loss_att=50.276, acc=0.954, loss=50.276, backward_time=0.297, grad_norm=91.870, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 07:55:07,355 (trainer:732) INFO: 28epoch:train:8446-9008batch: iter_time=2.360e-04, forward_time=0.201, loss_att=49.738, acc=0.954, loss=49.738, backward_time=0.295, grad_norm=85.779, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.648 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:01:21,043 (trainer:732) INFO: 28epoch:train:9009-9571batch: iter_time=2.296e-04, forward_time=0.201, loss_att=50.401, acc=0.954, loss=50.401, backward_time=0.296, grad_norm=91.115, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:07:37,300 (trainer:732) INFO: 28epoch:train:9572-10134batch: iter_time=2.339e-04, forward_time=0.203, loss_att=50.044, acc=0.955, loss=50.044, backward_time=0.298, grad_norm=83.913, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:13:52,977 (trainer:732) INFO: 28epoch:train:10135-10697batch: iter_time=2.363e-04, forward_time=0.202, loss_att=49.918, acc=0.955, loss=49.918, backward_time=0.298, grad_norm=90.425, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:20:10,372 (trainer:732) INFO: 28epoch:train:10698-11260batch: iter_time=2.252e-04, forward_time=0.203, loss_att=50.050, acc=0.955, loss=50.050, backward_time=0.299, grad_norm=89.505, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:28:13,502 (trainer:338) INFO: 28epoch results: [train] iter_time=3.394e-04, forward_time=0.202, loss_att=49.765, acc=0.955, loss=49.765, backward_time=0.297, grad_norm=88.083, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.721, time=2 hours, 7 minutes and 52.44 seconds, total_count=315616, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.496, acc=0.981, cer=0.023, wer=0.090, loss=11.496, time=4 minutes and 39.08 seconds, total_count=1568, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 11.62 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:28:17,108 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:28:17,137 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/18epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:28:17,137 (trainer:272) INFO: 29/60epoch started. Estimated time to finish: 3 days, 50 minutes and 39.69 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:37:08,833 (trainer:732) INFO: 29epoch:train:1-563batch: iter_time=0.001, forward_time=0.202, loss_att=47.114, acc=0.957, loss=47.114, backward_time=0.298, grad_norm=88.595, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.784 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:43:24,609 (trainer:732) INFO: 29epoch:train:564-1126batch: iter_time=2.487e-04, forward_time=0.203, loss_att=48.815, acc=0.955, loss=48.815, backward_time=0.298, grad_norm=87.784, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:49:38,828 (trainer:732) INFO: 29epoch:train:1127-1689batch: iter_time=2.361e-04, forward_time=0.202, loss_att=48.432, acc=0.955, loss=48.432, backward_time=0.296, grad_norm=95.876, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 08:55:53,519 (trainer:732) INFO: 29epoch:train:1690-2252batch: iter_time=2.403e-04, forward_time=0.202, loss_att=49.304, acc=0.955, loss=49.304, backward_time=0.297, grad_norm=92.483, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:02:08,216 (trainer:732) INFO: 29epoch:train:2253-2815batch: iter_time=2.479e-04, forward_time=0.201, loss_att=47.928, acc=0.956, loss=47.928, backward_time=0.296, grad_norm=81.198, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:08:24,313 (trainer:732) INFO: 29epoch:train:2816-3378batch: iter_time=2.659e-04, forward_time=0.202, loss_att=49.987, acc=0.955, loss=49.987, backward_time=0.297, grad_norm=86.142, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:14:38,510 (trainer:732) INFO: 29epoch:train:3379-3941batch: iter_time=2.477e-04, forward_time=0.201, loss_att=49.200, acc=0.955, loss=49.200, backward_time=0.296, grad_norm=92.048, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:20:54,961 (trainer:732) INFO: 29epoch:train:3942-4504batch: iter_time=2.434e-04, forward_time=0.202, loss_att=49.567, acc=0.955, loss=49.567, backward_time=0.298, grad_norm=88.286, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:27:12,443 (trainer:732) INFO: 29epoch:train:4505-5067batch: iter_time=2.345e-04, forward_time=0.203, loss_att=49.244, acc=0.956, loss=49.244, backward_time=0.299, grad_norm=91.353, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.994e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:33:28,494 (trainer:732) INFO: 29epoch:train:5068-5630batch: iter_time=2.489e-04, forward_time=0.202, loss_att=48.083, acc=0.956, loss=48.083, backward_time=0.298, grad_norm=84.777, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.985e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:39:43,833 (trainer:732) INFO: 29epoch:train:5631-6193batch: iter_time=2.364e-04, forward_time=0.202, loss_att=49.141, acc=0.955, loss=49.141, backward_time=0.297, grad_norm=85.426, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.976e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:45:57,300 (trainer:732) INFO: 29epoch:train:6194-6756batch: iter_time=2.400e-04, forward_time=0.201, loss_att=49.101, acc=0.955, loss=49.101, backward_time=0.296, grad_norm=89.710, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.967e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:52:13,230 (trainer:732) INFO: 29epoch:train:6757-7319batch: iter_time=2.406e-04, forward_time=0.202, loss_att=50.490, acc=0.955, loss=50.490, backward_time=0.298, grad_norm=92.308, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.959e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 09:58:26,878 (trainer:732) INFO: 29epoch:train:7320-7882batch: iter_time=2.363e-04, forward_time=0.201, loss_att=48.853, acc=0.955, loss=48.853, backward_time=0.295, grad_norm=88.635, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.950e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:04:42,493 (trainer:732) INFO: 29epoch:train:7883-8445batch: iter_time=2.463e-04, forward_time=0.202, loss_att=48.714, acc=0.955, loss=48.714, backward_time=0.297, grad_norm=88.996, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.941e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:10:57,303 (trainer:732) INFO: 29epoch:train:8446-9008batch: iter_time=2.459e-04, forward_time=0.201, loss_att=49.039, acc=0.955, loss=49.039, backward_time=0.296, grad_norm=85.678, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.933e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:17:13,425 (trainer:732) INFO: 29epoch:train:9009-9571batch: iter_time=2.463e-04, forward_time=0.203, loss_att=50.441, acc=0.955, loss=50.441, backward_time=0.298, grad_norm=89.013, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.924e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:23:25,780 (trainer:732) INFO: 29epoch:train:9572-10134batch: iter_time=2.461e-04, forward_time=0.200, loss_att=47.776, acc=0.955, loss=47.776, backward_time=0.295, grad_norm=84.317, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.916e-04, train_time=2.644 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:29:41,535 (trainer:732) INFO: 29epoch:train:10135-10697batch: iter_time=2.476e-04, forward_time=0.202, loss_att=49.646, acc=0.955, loss=49.646, backward_time=0.297, grad_norm=86.635, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.907e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:35:57,533 (trainer:732) INFO: 29epoch:train:10698-11260batch: iter_time=2.384e-04, forward_time=0.202, loss_att=49.191, acc=0.955, loss=49.191, backward_time=0.298, grad_norm=86.606, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.898e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:43:59,220 (trainer:338) INFO: 29epoch results: [train] iter_time=3.032e-04, forward_time=0.202, loss_att=48.994, acc=0.955, loss=48.994, backward_time=0.297, grad_norm=88.286, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.981e-04, train_time=2.721, time=2 hours, 7 minutes and 53.12 seconds, total_count=326888, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.037, acc=0.982, cer=0.023, wer=0.087, loss=11.037, time=4 minutes and 38.93 seconds, total_count=1624, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.03 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:44:02,855 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:44:02,865 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/19epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:44:02,865 (trainer:272) INFO: 30/60epoch started. Estimated time to finish: 2 days, 22 hours and 33 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:52:53,139 (trainer:732) INFO: 30epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=47.037, acc=0.956, loss=47.037, backward_time=0.297, grad_norm=87.253, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.890e-04, train_time=3.773 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 10:59:11,136 (trainer:732) INFO: 30epoch:train:564-1126batch: iter_time=2.631e-04, forward_time=0.203, loss_att=48.199, acc=0.957, loss=48.199, backward_time=0.299, grad_norm=86.671, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.881e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:05:26,309 (trainer:732) INFO: 30epoch:train:1127-1689batch: iter_time=2.619e-04, forward_time=0.202, loss_att=46.673, acc=0.957, loss=46.673, backward_time=0.297, grad_norm=86.303, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.873e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:11:42,354 (trainer:732) INFO: 30epoch:train:1690-2252batch: iter_time=2.500e-04, forward_time=0.202, loss_att=49.642, acc=0.955, loss=49.642, backward_time=0.297, grad_norm=80.373, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.864e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:17:57,989 (trainer:732) INFO: 30epoch:train:2253-2815batch: iter_time=2.537e-04, forward_time=0.202, loss_att=46.575, acc=0.957, loss=46.575, backward_time=0.297, grad_norm=88.333, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.856e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:24:10,964 (trainer:732) INFO: 30epoch:train:2816-3378batch: iter_time=2.594e-04, forward_time=0.201, loss_att=47.723, acc=0.956, loss=47.723, backward_time=0.295, grad_norm=87.737, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.847e-04, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:30:27,288 (trainer:732) INFO: 30epoch:train:3379-3941batch: iter_time=2.504e-04, forward_time=0.203, loss_att=47.363, acc=0.957, loss=47.363, backward_time=0.298, grad_norm=95.620, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.839e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:36:41,986 (trainer:732) INFO: 30epoch:train:3942-4504batch: iter_time=2.514e-04, forward_time=0.201, loss_att=48.621, acc=0.955, loss=48.621, backward_time=0.296, grad_norm=85.730, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.831e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:42:57,761 (trainer:732) INFO: 30epoch:train:4505-5067batch: iter_time=2.630e-04, forward_time=0.203, loss_att=48.528, acc=0.956, loss=48.528, backward_time=0.298, grad_norm=81.217, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.822e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:49:14,916 (trainer:732) INFO: 30epoch:train:5068-5630batch: iter_time=2.585e-04, forward_time=0.203, loss_att=49.331, acc=0.956, loss=49.331, backward_time=0.299, grad_norm=91.096, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.814e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 11:55:28,801 (trainer:732) INFO: 30epoch:train:5631-6193batch: iter_time=2.555e-04, forward_time=0.201, loss_att=49.018, acc=0.955, loss=49.018, backward_time=0.296, grad_norm=83.438, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.806e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:01:44,335 (trainer:732) INFO: 30epoch:train:6194-6756batch: iter_time=2.537e-04, forward_time=0.202, loss_att=49.855, acc=0.956, loss=49.855, backward_time=0.297, grad_norm=91.054, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.797e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:07:58,604 (trainer:732) INFO: 30epoch:train:6757-7319batch: iter_time=2.703e-04, forward_time=0.201, loss_att=47.301, acc=0.956, loss=47.301, backward_time=0.296, grad_norm=85.638, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.789e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:14:12,191 (trainer:732) INFO: 30epoch:train:7320-7882batch: iter_time=2.541e-04, forward_time=0.201, loss_att=48.220, acc=0.955, loss=48.220, backward_time=0.296, grad_norm=84.806, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.781e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:20:27,764 (trainer:732) INFO: 30epoch:train:7883-8445batch: iter_time=2.569e-04, forward_time=0.202, loss_att=47.833, acc=0.956, loss=47.833, backward_time=0.297, grad_norm=83.605, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.773e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:26:42,512 (trainer:732) INFO: 30epoch:train:8446-9008batch: iter_time=2.535e-04, forward_time=0.202, loss_att=49.153, acc=0.955, loss=49.153, backward_time=0.297, grad_norm=85.299, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.765e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:32:56,466 (trainer:732) INFO: 30epoch:train:9009-9571batch: iter_time=2.536e-04, forward_time=0.201, loss_att=47.750, acc=0.956, loss=47.750, backward_time=0.296, grad_norm=88.471, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.756e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:39:12,179 (trainer:732) INFO: 30epoch:train:9572-10134batch: iter_time=2.511e-04, forward_time=0.202, loss_att=48.372, acc=0.956, loss=48.372, backward_time=0.297, grad_norm=86.719, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.748e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:45:26,446 (trainer:732) INFO: 30epoch:train:10135-10697batch: iter_time=2.595e-04, forward_time=0.202, loss_att=47.830, acc=0.956, loss=47.830, backward_time=0.296, grad_norm=88.669, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.740e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:51:44,278 (trainer:732) INFO: 30epoch:train:10698-11260batch: iter_time=2.458e-04, forward_time=0.203, loss_att=49.021, acc=0.956, loss=49.021, backward_time=0.299, grad_norm=87.277, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.732e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:59:43,361 (trainer:338) INFO: 30epoch results: [train] iter_time=3.396e-04, forward_time=0.202, loss_att=48.194, acc=0.956, loss=48.194, backward_time=0.297, grad_norm=86.783, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.810e-04, train_time=2.721, time=2 hours, 7 minutes and 54.26 seconds, total_count=338160, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.450, acc=0.981, cer=0.023, wer=0.090, loss=11.450, time=4 minutes and 36.94 seconds, total_count=1680, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 9.29 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:59:47,050 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:59:47,077 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/20epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 12:59:47,077 (trainer:272) INFO: 31/60epoch started. Estimated time to finish: 2 days, 20 hours and 15 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:08:39,053 (trainer:732) INFO: 31epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=47.343, acc=0.958, loss=47.343, backward_time=0.299, grad_norm=86.353, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.724e-04, train_time=3.786 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:14:55,020 (trainer:732) INFO: 31epoch:train:564-1126batch: iter_time=2.449e-04, forward_time=0.202, loss_att=46.891, acc=0.957, loss=46.891, backward_time=0.297, grad_norm=87.599, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.716e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:21:10,161 (trainer:732) INFO: 31epoch:train:1127-1689batch: iter_time=2.459e-04, forward_time=0.202, loss_att=46.954, acc=0.957, loss=46.954, backward_time=0.297, grad_norm=85.620, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.708e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:27:26,159 (trainer:732) INFO: 31epoch:train:1690-2252batch: iter_time=2.420e-04, forward_time=0.203, loss_att=48.258, acc=0.956, loss=48.258, backward_time=0.298, grad_norm=87.778, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.699e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:33:40,797 (trainer:732) INFO: 31epoch:train:2253-2815batch: iter_time=2.250e-04, forward_time=0.201, loss_att=46.140, acc=0.958, loss=46.140, backward_time=0.297, grad_norm=86.434, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.691e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:39:56,301 (trainer:732) INFO: 31epoch:train:2816-3378batch: iter_time=2.335e-04, forward_time=0.202, loss_att=47.475, acc=0.956, loss=47.475, backward_time=0.297, grad_norm=86.515, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.683e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:46:11,907 (trainer:732) INFO: 31epoch:train:3379-3941batch: iter_time=2.236e-04, forward_time=0.202, loss_att=47.163, acc=0.957, loss=47.163, backward_time=0.297, grad_norm=85.573, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.676e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:52:25,597 (trainer:732) INFO: 31epoch:train:3942-4504batch: iter_time=2.320e-04, forward_time=0.201, loss_att=47.311, acc=0.956, loss=47.311, backward_time=0.295, grad_norm=82.823, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.668e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 13:58:41,504 (trainer:732) INFO: 31epoch:train:4505-5067batch: iter_time=2.316e-04, forward_time=0.202, loss_att=47.467, acc=0.956, loss=47.467, backward_time=0.298, grad_norm=86.271, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.660e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:04:55,304 (trainer:732) INFO: 31epoch:train:5068-5630batch: iter_time=2.297e-04, forward_time=0.201, loss_att=47.110, acc=0.957, loss=47.110, backward_time=0.296, grad_norm=85.892, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.652e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:11:11,544 (trainer:732) INFO: 31epoch:train:5631-6193batch: iter_time=2.359e-04, forward_time=0.202, loss_att=48.122, acc=0.957, loss=48.122, backward_time=0.298, grad_norm=87.042, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.644e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:17:26,399 (trainer:732) INFO: 31epoch:train:6194-6756batch: iter_time=2.238e-04, forward_time=0.202, loss_att=48.205, acc=0.956, loss=48.205, backward_time=0.296, grad_norm=84.002, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.636e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:23:39,927 (trainer:732) INFO: 31epoch:train:6757-7319batch: iter_time=2.205e-04, forward_time=0.201, loss_att=47.382, acc=0.956, loss=47.382, backward_time=0.296, grad_norm=85.669, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.628e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:29:54,646 (trainer:732) INFO: 31epoch:train:7320-7882batch: iter_time=2.280e-04, forward_time=0.201, loss_att=46.990, acc=0.957, loss=46.990, backward_time=0.297, grad_norm=86.996, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.620e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:36:09,210 (trainer:732) INFO: 31epoch:train:7883-8445batch: iter_time=2.401e-04, forward_time=0.202, loss_att=48.257, acc=0.956, loss=48.257, backward_time=0.296, grad_norm=93.851, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.612e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:42:25,803 (trainer:732) INFO: 31epoch:train:8446-9008batch: iter_time=2.338e-04, forward_time=0.202, loss_att=47.667, acc=0.956, loss=47.667, backward_time=0.298, grad_norm=87.527, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.605e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:48:38,164 (trainer:732) INFO: 31epoch:train:9009-9571batch: iter_time=2.419e-04, forward_time=0.201, loss_att=46.939, acc=0.956, loss=46.939, backward_time=0.295, grad_norm=83.732, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.597e-04, train_time=2.646 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 14:54:53,045 (trainer:732) INFO: 31epoch:train:9572-10134batch: iter_time=2.249e-04, forward_time=0.202, loss_att=48.157, acc=0.956, loss=48.157, backward_time=0.297, grad_norm=84.387, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.589e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:01:09,805 (trainer:732) INFO: 31epoch:train:10135-10697batch: iter_time=2.277e-04, forward_time=0.202, loss_att=47.340, acc=0.957, loss=47.340, backward_time=0.298, grad_norm=87.180, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.581e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:07:25,617 (trainer:732) INFO: 31epoch:train:10698-11260batch: iter_time=2.285e-04, forward_time=0.202, loss_att=48.584, acc=0.957, loss=48.584, backward_time=0.298, grad_norm=92.381, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.574e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:15:28,519 (trainer:338) INFO: 31epoch results: [train] iter_time=3.069e-04, forward_time=0.202, loss_att=47.481, acc=0.957, loss=47.481, backward_time=0.297, grad_norm=86.685, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.648e-04, train_time=2.720, time=2 hours, 7 minutes and 51.61 seconds, total_count=349432, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.364, acc=0.981, cer=0.023, wer=0.089, loss=11.364, time=4 minutes and 38.88 seconds, total_count=1736, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.95 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:15:32,101 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:15:32,130 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/21epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:15:32,131 (trainer:272) INFO: 32/60epoch started. Estimated time to finish: 2 days, 17 hours and 58 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:24:23,222 (trainer:732) INFO: 32epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=45.831, acc=0.959, loss=45.831, backward_time=0.298, grad_norm=87.896, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.566e-04, train_time=3.781 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:30:38,280 (trainer:732) INFO: 32epoch:train:564-1126batch: iter_time=2.389e-04, forward_time=0.202, loss_att=46.349, acc=0.957, loss=46.349, backward_time=0.297, grad_norm=84.762, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.558e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:36:52,343 (trainer:732) INFO: 32epoch:train:1127-1689batch: iter_time=2.388e-04, forward_time=0.201, loss_att=45.291, acc=0.958, loss=45.291, backward_time=0.296, grad_norm=82.656, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.550e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:43:05,476 (trainer:732) INFO: 32epoch:train:1690-2252batch: iter_time=2.267e-04, forward_time=0.201, loss_att=47.183, acc=0.957, loss=47.183, backward_time=0.295, grad_norm=82.623, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.543e-04, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:49:20,186 (trainer:732) INFO: 32epoch:train:2253-2815batch: iter_time=2.346e-04, forward_time=0.202, loss_att=45.723, acc=0.957, loss=45.723, backward_time=0.296, grad_norm=82.440, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.535e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 15:55:34,515 (trainer:732) INFO: 32epoch:train:2816-3378batch: iter_time=2.353e-04, forward_time=0.201, loss_att=46.627, acc=0.957, loss=46.627, backward_time=0.296, grad_norm=82.590, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.527e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:01:48,925 (trainer:732) INFO: 32epoch:train:3379-3941batch: iter_time=2.303e-04, forward_time=0.201, loss_att=47.205, acc=0.957, loss=47.205, backward_time=0.296, grad_norm=83.668, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.520e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:08:03,828 (trainer:732) INFO: 32epoch:train:3942-4504batch: iter_time=2.287e-04, forward_time=0.201, loss_att=46.232, acc=0.957, loss=46.232, backward_time=0.296, grad_norm=84.197, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.512e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:14:20,174 (trainer:732) INFO: 32epoch:train:4505-5067batch: iter_time=2.270e-04, forward_time=0.203, loss_att=47.029, acc=0.957, loss=47.029, backward_time=0.299, grad_norm=86.660, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.505e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:20:33,352 (trainer:732) INFO: 32epoch:train:5068-5630batch: iter_time=2.322e-04, forward_time=0.201, loss_att=45.599, acc=0.957, loss=45.599, backward_time=0.295, grad_norm=86.671, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.497e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:26:48,412 (trainer:732) INFO: 32epoch:train:5631-6193batch: iter_time=2.255e-04, forward_time=0.202, loss_att=46.555, acc=0.957, loss=46.555, backward_time=0.297, grad_norm=86.635, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.490e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:33:02,448 (trainer:732) INFO: 32epoch:train:6194-6756batch: iter_time=2.274e-04, forward_time=0.201, loss_att=47.515, acc=0.957, loss=47.515, backward_time=0.296, grad_norm=88.266, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.482e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:39:18,228 (trainer:732) INFO: 32epoch:train:6757-7319batch: iter_time=2.406e-04, forward_time=0.202, loss_att=47.123, acc=0.957, loss=47.123, backward_time=0.297, grad_norm=85.205, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.475e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:45:33,421 (trainer:732) INFO: 32epoch:train:7320-7882batch: iter_time=2.316e-04, forward_time=0.202, loss_att=46.706, acc=0.957, loss=46.706, backward_time=0.297, grad_norm=82.748, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.467e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:51:47,593 (trainer:732) INFO: 32epoch:train:7883-8445batch: iter_time=2.406e-04, forward_time=0.201, loss_att=45.542, acc=0.958, loss=45.542, backward_time=0.296, grad_norm=89.198, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.460e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 16:58:03,772 (trainer:732) INFO: 32epoch:train:8446-9008batch: iter_time=2.399e-04, forward_time=0.202, loss_att=47.369, acc=0.957, loss=47.369, backward_time=0.298, grad_norm=92.191, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.452e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:04:19,343 (trainer:732) INFO: 32epoch:train:9009-9571batch: iter_time=2.425e-04, forward_time=0.202, loss_att=47.587, acc=0.957, loss=47.587, backward_time=0.297, grad_norm=84.032, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.445e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:10:39,200 (trainer:732) INFO: 32epoch:train:9572-10134batch: iter_time=2.355e-04, forward_time=0.204, loss_att=47.691, acc=0.958, loss=47.691, backward_time=0.301, grad_norm=95.175, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.437e-04, train_time=2.699 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:16:56,194 (trainer:732) INFO: 32epoch:train:10135-10697batch: iter_time=2.283e-04, forward_time=0.203, loss_att=48.472, acc=0.957, loss=48.472, backward_time=0.299, grad_norm=85.830, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.430e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:23:11,518 (trainer:732) INFO: 32epoch:train:10698-11260batch: iter_time=2.255e-04, forward_time=0.202, loss_att=47.301, acc=0.957, loss=47.301, backward_time=0.297, grad_norm=90.058, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.423e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:31:15,954 (trainer:338) INFO: 32epoch results: [train] iter_time=3.199e-04, forward_time=0.202, loss_att=46.739, acc=0.957, loss=46.739, backward_time=0.297, grad_norm=86.179, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.494e-04, train_time=2.720, time=2 hours, 7 minutes and 51.45 seconds, total_count=360704, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.929, acc=0.982, cer=0.023, wer=0.086, loss=10.929, time=4 minutes and 39.34 seconds, total_count=1792, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.02 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:31:20,084 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:31:20,095 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/22epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:31:20,096 (trainer:272) INFO: 33/60epoch started. Estimated time to finish: 2 days, 15 hours and 41 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:40:10,526 (trainer:732) INFO: 33epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=44.035, acc=0.959, loss=44.035, backward_time=0.297, grad_norm=83.746, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.415e-04, train_time=3.775 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:46:26,606 (trainer:732) INFO: 33epoch:train:564-1126batch: iter_time=2.462e-04, forward_time=0.202, loss_att=45.371, acc=0.959, loss=45.371, backward_time=0.298, grad_norm=105.788, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.408e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:52:43,399 (trainer:732) INFO: 33epoch:train:1127-1689batch: iter_time=2.378e-04, forward_time=0.203, loss_att=46.386, acc=0.958, loss=46.386, backward_time=0.298, grad_norm=88.274, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.401e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 17:58:57,360 (trainer:732) INFO: 33epoch:train:1690-2252batch: iter_time=2.357e-04, forward_time=0.201, loss_att=45.773, acc=0.958, loss=45.773, backward_time=0.295, grad_norm=87.967, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.393e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:05:15,619 (trainer:732) INFO: 33epoch:train:2253-2815batch: iter_time=2.302e-04, forward_time=0.204, loss_att=46.588, acc=0.959, loss=46.588, backward_time=0.300, grad_norm=86.376, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.386e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:11:30,794 (trainer:732) INFO: 33epoch:train:2816-3378batch: iter_time=2.365e-04, forward_time=0.202, loss_att=45.287, acc=0.958, loss=45.287, backward_time=0.297, grad_norm=84.906, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.379e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:17:46,976 (trainer:732) INFO: 33epoch:train:3379-3941batch: iter_time=2.304e-04, forward_time=0.202, loss_att=46.512, acc=0.958, loss=46.512, backward_time=0.298, grad_norm=89.467, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.371e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:24:01,599 (trainer:732) INFO: 33epoch:train:3942-4504batch: iter_time=2.306e-04, forward_time=0.201, loss_att=45.344, acc=0.958, loss=45.344, backward_time=0.296, grad_norm=92.387, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.364e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:30:14,978 (trainer:732) INFO: 33epoch:train:4505-5067batch: iter_time=2.409e-04, forward_time=0.201, loss_att=44.309, acc=0.958, loss=44.309, backward_time=0.295, grad_norm=85.297, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.357e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:36:30,483 (trainer:732) INFO: 33epoch:train:5068-5630batch: iter_time=2.343e-04, forward_time=0.202, loss_att=46.475, acc=0.958, loss=46.475, backward_time=0.297, grad_norm=86.607, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.350e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:42:44,560 (trainer:732) INFO: 33epoch:train:5631-6193batch: iter_time=2.404e-04, forward_time=0.201, loss_att=46.442, acc=0.957, loss=46.442, backward_time=0.296, grad_norm=90.863, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.343e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:49:00,844 (trainer:732) INFO: 33epoch:train:6194-6756batch: iter_time=2.319e-04, forward_time=0.203, loss_att=45.853, acc=0.958, loss=45.853, backward_time=0.298, grad_norm=86.951, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.335e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 18:55:16,537 (trainer:732) INFO: 33epoch:train:6757-7319batch: iter_time=2.283e-04, forward_time=0.202, loss_att=47.439, acc=0.958, loss=47.439, backward_time=0.298, grad_norm=81.853, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.328e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:01:30,456 (trainer:732) INFO: 33epoch:train:7320-7882batch: iter_time=2.381e-04, forward_time=0.201, loss_att=45.855, acc=0.957, loss=45.855, backward_time=0.296, grad_norm=90.888, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.321e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:07:44,868 (trainer:732) INFO: 33epoch:train:7883-8445batch: iter_time=2.344e-04, forward_time=0.202, loss_att=46.256, acc=0.957, loss=46.256, backward_time=0.297, grad_norm=89.858, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.314e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:13:59,528 (trainer:732) INFO: 33epoch:train:8446-9008batch: iter_time=2.219e-04, forward_time=0.201, loss_att=46.822, acc=0.957, loss=46.822, backward_time=0.296, grad_norm=89.146, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.307e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:20:14,935 (trainer:732) INFO: 33epoch:train:9009-9571batch: iter_time=2.306e-04, forward_time=0.202, loss_att=47.146, acc=0.957, loss=47.146, backward_time=0.298, grad_norm=92.671, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.300e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:26:29,107 (trainer:732) INFO: 33epoch:train:9572-10134batch: iter_time=2.240e-04, forward_time=0.201, loss_att=46.696, acc=0.957, loss=46.696, backward_time=0.296, grad_norm=83.526, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.293e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:32:43,604 (trainer:732) INFO: 33epoch:train:10135-10697batch: iter_time=2.258e-04, forward_time=0.202, loss_att=46.939, acc=0.957, loss=46.939, backward_time=0.297, grad_norm=82.740, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.286e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:38:59,270 (trainer:732) INFO: 33epoch:train:10698-11260batch: iter_time=2.261e-04, forward_time=0.202, loss_att=46.881, acc=0.957, loss=46.881, backward_time=0.297, grad_norm=86.949, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.279e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:46:58,458 (trainer:338) INFO: 33epoch results: [train] iter_time=3.135e-04, forward_time=0.202, loss_att=46.116, acc=0.958, loss=46.116, backward_time=0.297, grad_norm=88.309, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.346e-04, train_time=2.721, time=2 hours, 7 minutes and 52.15 seconds, total_count=371976, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.154, acc=0.982, cer=0.023, wer=0.087, loss=11.154, time=4 minutes and 37 seconds, total_count=1848, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 9.21 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:47:02,331 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:47:02,364 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/23epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:47:02,365 (trainer:272) INFO: 34/60epoch started. Estimated time to finish: 2 days, 13 hours and 24 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 19:55:50,783 (trainer:732) INFO: 34epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=44.691, acc=0.959, loss=44.691, backward_time=0.297, grad_norm=93.793, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.272e-04, train_time=3.760 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:02:09,014 (trainer:732) INFO: 34epoch:train:564-1126batch: iter_time=2.729e-04, forward_time=0.203, loss_att=45.903, acc=0.959, loss=45.903, backward_time=0.300, grad_norm=87.142, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.265e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:08:26,401 (trainer:732) INFO: 34epoch:train:1127-1689batch: iter_time=2.650e-04, forward_time=0.203, loss_att=45.444, acc=0.959, loss=45.444, backward_time=0.298, grad_norm=83.825, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.258e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:14:39,428 (trainer:732) INFO: 34epoch:train:1690-2252batch: iter_time=2.466e-04, forward_time=0.201, loss_att=44.552, acc=0.959, loss=44.552, backward_time=0.294, grad_norm=85.729, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.251e-04, train_time=2.649 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:20:52,372 (trainer:732) INFO: 34epoch:train:2253-2815batch: iter_time=2.388e-04, forward_time=0.202, loss_att=46.034, acc=0.958, loss=46.034, backward_time=0.294, grad_norm=88.276, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.244e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:27:06,163 (trainer:732) INFO: 34epoch:train:2816-3378batch: iter_time=2.489e-04, forward_time=0.202, loss_att=45.828, acc=0.958, loss=45.828, backward_time=0.296, grad_norm=88.715, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.237e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:33:17,099 (trainer:732) INFO: 34epoch:train:3379-3941batch: iter_time=2.422e-04, forward_time=0.201, loss_att=45.339, acc=0.958, loss=45.339, backward_time=0.293, grad_norm=79.706, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.230e-04, train_time=2.634 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:39:29,034 (trainer:732) INFO: 34epoch:train:3942-4504batch: iter_time=2.401e-04, forward_time=0.201, loss_att=46.362, acc=0.958, loss=46.362, backward_time=0.293, grad_norm=81.437, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.223e-04, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:45:39,632 (trainer:732) INFO: 34epoch:train:4505-5067batch: iter_time=2.529e-04, forward_time=0.200, loss_att=45.634, acc=0.958, loss=45.634, backward_time=0.292, grad_norm=83.886, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.216e-04, train_time=2.634 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:51:53,092 (trainer:732) INFO: 34epoch:train:5068-5630batch: iter_time=2.443e-04, forward_time=0.202, loss_att=45.465, acc=0.959, loss=45.465, backward_time=0.295, grad_norm=84.530, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.209e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 20:58:05,406 (trainer:732) INFO: 34epoch:train:5631-6193batch: iter_time=2.383e-04, forward_time=0.200, loss_att=45.172, acc=0.959, loss=45.172, backward_time=0.293, grad_norm=85.054, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.202e-04, train_time=2.645 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:04:19,307 (trainer:732) INFO: 34epoch:train:6194-6756batch: iter_time=2.352e-04, forward_time=0.202, loss_att=45.491, acc=0.959, loss=45.491, backward_time=0.295, grad_norm=87.619, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.195e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:10:30,801 (trainer:732) INFO: 34epoch:train:6757-7319batch: iter_time=2.389e-04, forward_time=0.201, loss_att=45.646, acc=0.958, loss=45.646, backward_time=0.293, grad_norm=92.790, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.189e-04, train_time=2.641 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:16:42,661 (trainer:732) INFO: 34epoch:train:7320-7882batch: iter_time=2.453e-04, forward_time=0.201, loss_att=45.893, acc=0.958, loss=45.893, backward_time=0.293, grad_norm=79.685, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.182e-04, train_time=2.640 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:22:55,799 (trainer:732) INFO: 34epoch:train:7883-8445batch: iter_time=2.413e-04, forward_time=0.201, loss_att=46.289, acc=0.958, loss=46.289, backward_time=0.294, grad_norm=86.857, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.175e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:29:07,754 (trainer:732) INFO: 34epoch:train:8446-9008batch: iter_time=2.400e-04, forward_time=0.201, loss_att=45.334, acc=0.957, loss=45.334, backward_time=0.293, grad_norm=83.457, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.168e-04, train_time=2.641 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:35:18,254 (trainer:732) INFO: 34epoch:train:9009-9571batch: iter_time=2.329e-04, forward_time=0.201, loss_att=44.987, acc=0.958, loss=44.987, backward_time=0.293, grad_norm=81.013, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.161e-04, train_time=2.633 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:41:30,224 (trainer:732) INFO: 34epoch:train:9572-10134batch: iter_time=2.401e-04, forward_time=0.202, loss_att=45.473, acc=0.959, loss=45.473, backward_time=0.294, grad_norm=82.822, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.155e-04, train_time=2.640 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:47:42,506 (trainer:732) INFO: 34epoch:train:10135-10697batch: iter_time=2.412e-04, forward_time=0.201, loss_att=45.425, acc=0.958, loss=45.425, backward_time=0.294, grad_norm=85.968, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.148e-04, train_time=2.647 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 21:53:55,635 (trainer:732) INFO: 34epoch:train:10698-11260batch: iter_time=2.341e-04, forward_time=0.201, loss_att=45.458, acc=0.958, loss=45.458, backward_time=0.294, grad_norm=85.707, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.141e-04, train_time=2.650 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:01:47,924 (trainer:338) INFO: 34epoch results: [train] iter_time=3.427e-04, forward_time=0.201, loss_att=45.524, acc=0.958, loss=45.524, backward_time=0.294, grad_norm=85.413, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.206e-04, train_time=2.704, time=2 hours, 7 minutes and 5.6 seconds, total_count=383248, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.360, acc=0.981, cer=0.022, wer=0.086, loss=11.360, time=4 minutes and 29.85 seconds, total_count=1904, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 10.11 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:01:52,264 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:01:52,291 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/24epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:01:52,292 (trainer:272) INFO: 35/60epoch started. Estimated time to finish: 2 days, 11 hours and 6 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:10:37,518 (trainer:732) INFO: 35epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=46.289, acc=0.959, loss=46.289, backward_time=0.295, grad_norm=90.968, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.134e-04, train_time=3.736 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:16:51,119 (trainer:732) INFO: 35epoch:train:564-1126batch: iter_time=2.485e-04, forward_time=0.202, loss_att=45.058, acc=0.959, loss=45.058, backward_time=0.295, grad_norm=84.845, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.128e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:23:03,191 (trainer:732) INFO: 35epoch:train:1127-1689batch: iter_time=2.562e-04, forward_time=0.201, loss_att=44.101, acc=0.959, loss=44.101, backward_time=0.294, grad_norm=88.225, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.121e-04, train_time=2.643 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:29:16,743 (trainer:732) INFO: 35epoch:train:1690-2252batch: iter_time=2.646e-04, forward_time=0.202, loss_att=45.147, acc=0.959, loss=45.147, backward_time=0.294, grad_norm=86.724, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=9.114e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:35:29,959 (trainer:732) INFO: 35epoch:train:2253-2815batch: iter_time=2.631e-04, forward_time=0.202, loss_att=43.863, acc=0.959, loss=43.863, backward_time=0.294, grad_norm=86.315, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.108e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:41:41,308 (trainer:732) INFO: 35epoch:train:2816-3378batch: iter_time=2.746e-04, forward_time=0.201, loss_att=45.472, acc=0.959, loss=45.472, backward_time=0.293, grad_norm=82.424, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.101e-04, train_time=2.639 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:47:53,208 (trainer:732) INFO: 35epoch:train:3379-3941batch: iter_time=2.703e-04, forward_time=0.201, loss_att=44.353, acc=0.959, loss=44.353, backward_time=0.293, grad_norm=86.624, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=9.094e-04, train_time=2.642 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 22:54:08,665 (trainer:732) INFO: 35epoch:train:3942-4504batch: iter_time=2.689e-04, forward_time=0.202, loss_att=45.008, acc=0.959, loss=45.008, backward_time=0.296, grad_norm=87.472, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.088e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:00:20,869 (trainer:732) INFO: 35epoch:train:4505-5067batch: iter_time=2.692e-04, forward_time=0.200, loss_att=42.910, acc=0.960, loss=42.910, backward_time=0.294, grad_norm=87.852, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.081e-04, train_time=2.646 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:06:35,470 (trainer:732) INFO: 35epoch:train:5068-5630batch: iter_time=2.881e-04, forward_time=0.202, loss_att=45.175, acc=0.959, loss=45.175, backward_time=0.296, grad_norm=90.828, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.075e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:12:50,857 (trainer:732) INFO: 35epoch:train:5631-6193batch: iter_time=2.877e-04, forward_time=0.202, loss_att=44.443, acc=0.959, loss=44.443, backward_time=0.296, grad_norm=94.081, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.068e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:19:06,229 (trainer:732) INFO: 35epoch:train:6194-6756batch: iter_time=2.733e-04, forward_time=0.202, loss_att=44.892, acc=0.958, loss=44.892, backward_time=0.296, grad_norm=83.188, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.061e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:25:22,463 (trainer:732) INFO: 35epoch:train:6757-7319batch: iter_time=2.832e-04, forward_time=0.203, loss_att=44.962, acc=0.958, loss=44.962, backward_time=0.298, grad_norm=84.497, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.055e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:31:40,317 (trainer:732) INFO: 35epoch:train:7320-7882batch: iter_time=2.735e-04, forward_time=0.203, loss_att=46.955, acc=0.958, loss=46.955, backward_time=0.299, grad_norm=89.133, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.048e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:37:56,008 (trainer:732) INFO: 35epoch:train:7883-8445batch: iter_time=2.894e-04, forward_time=0.202, loss_att=44.395, acc=0.959, loss=44.395, backward_time=0.297, grad_norm=81.524, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.042e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:44:12,253 (trainer:732) INFO: 35epoch:train:8446-9008batch: iter_time=2.792e-04, forward_time=0.202, loss_att=45.230, acc=0.959, loss=45.230, backward_time=0.297, grad_norm=87.090, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.035e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:50:26,384 (trainer:732) INFO: 35epoch:train:9009-9571batch: iter_time=2.794e-04, forward_time=0.202, loss_att=44.938, acc=0.959, loss=44.938, backward_time=0.296, grad_norm=86.498, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.029e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-24 23:56:44,198 (trainer:732) INFO: 35epoch:train:9572-10134batch: iter_time=2.825e-04, forward_time=0.204, loss_att=45.437, acc=0.959, loss=45.437, backward_time=0.299, grad_norm=89.430, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.022e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:02:59,507 (trainer:732) INFO: 35epoch:train:10135-10697batch: iter_time=2.683e-04, forward_time=0.202, loss_att=43.420, acc=0.960, loss=43.420, backward_time=0.297, grad_norm=91.359, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.016e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:09:17,329 (trainer:732) INFO: 35epoch:train:10698-11260batch: iter_time=2.795e-04, forward_time=0.203, loss_att=46.258, acc=0.959, loss=46.258, backward_time=0.299, grad_norm=88.557, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.009e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:17:42,600 (trainer:338) INFO: 35epoch results: [train] iter_time=3.781e-04, forward_time=0.202, loss_att=44.903, acc=0.959, loss=44.903, backward_time=0.296, grad_norm=87.398, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=9.071e-04, train_time=2.715, time=2 hours, 7 minutes and 38.45 seconds, total_count=394520, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.859, acc=0.982, cer=0.022, wer=0.085, loss=10.859, time=4 minutes and 55.84 seconds, total_count=1960, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 16.02 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:17:46,567 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:17:46,597 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/28epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:17:46,597 (trainer:272) INFO: 36/60epoch started. Estimated time to finish: 2 days, 8 hours and 49 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:26:43,365 (trainer:732) INFO: 36epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=44.228, acc=0.960, loss=44.228, backward_time=0.298, grad_norm=83.583, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.003e-04, train_time=3.819 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:32:59,171 (trainer:732) INFO: 36epoch:train:564-1126batch: iter_time=2.685e-04, forward_time=0.202, loss_att=44.722, acc=0.959, loss=44.722, backward_time=0.297, grad_norm=83.116, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.997e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:39:14,507 (trainer:732) INFO: 36epoch:train:1127-1689batch: iter_time=2.761e-04, forward_time=0.202, loss_att=43.763, acc=0.959, loss=43.763, backward_time=0.296, grad_norm=86.635, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.990e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:45:32,646 (trainer:732) INFO: 36epoch:train:1690-2252batch: iter_time=2.860e-04, forward_time=0.203, loss_att=43.830, acc=0.960, loss=43.830, backward_time=0.296, grad_norm=88.301, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.984e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:51:50,185 (trainer:732) INFO: 36epoch:train:2253-2815batch: iter_time=2.820e-04, forward_time=0.203, loss_att=43.348, acc=0.959, loss=43.348, backward_time=0.297, grad_norm=97.913, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.977e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 00:58:06,315 (trainer:732) INFO: 36epoch:train:2816-3378batch: iter_time=2.643e-04, forward_time=0.202, loss_att=43.427, acc=0.960, loss=43.427, backward_time=0.298, grad_norm=88.539, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.971e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:04:26,298 (trainer:732) INFO: 36epoch:train:3379-3941batch: iter_time=2.783e-04, forward_time=0.204, loss_att=44.846, acc=0.959, loss=44.846, backward_time=0.298, grad_norm=89.417, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.965e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:10:43,700 (trainer:732) INFO: 36epoch:train:3942-4504batch: iter_time=2.781e-04, forward_time=0.202, loss_att=43.558, acc=0.959, loss=43.558, backward_time=0.296, grad_norm=86.486, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.958e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:16:57,502 (trainer:732) INFO: 36epoch:train:4505-5067batch: iter_time=2.646e-04, forward_time=0.201, loss_att=43.673, acc=0.959, loss=43.673, backward_time=0.295, grad_norm=81.635, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.952e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:23:12,351 (trainer:732) INFO: 36epoch:train:5068-5630batch: iter_time=2.604e-04, forward_time=0.201, loss_att=43.847, acc=0.959, loss=43.847, backward_time=0.296, grad_norm=85.048, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.946e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:29:30,449 (trainer:732) INFO: 36epoch:train:5631-6193batch: iter_time=2.698e-04, forward_time=0.203, loss_att=44.724, acc=0.959, loss=44.724, backward_time=0.298, grad_norm=81.246, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.939e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:35:46,640 (trainer:732) INFO: 36epoch:train:6194-6756batch: iter_time=2.554e-04, forward_time=0.202, loss_att=44.921, acc=0.959, loss=44.921, backward_time=0.298, grad_norm=93.425, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.933e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:42:01,891 (trainer:732) INFO: 36epoch:train:6757-7319batch: iter_time=2.639e-04, forward_time=0.202, loss_att=44.467, acc=0.959, loss=44.467, backward_time=0.297, grad_norm=85.679, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.927e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:48:17,020 (trainer:732) INFO: 36epoch:train:7320-7882batch: iter_time=2.562e-04, forward_time=0.202, loss_att=44.376, acc=0.959, loss=44.376, backward_time=0.296, grad_norm=85.845, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.921e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 01:54:33,584 (trainer:732) INFO: 36epoch:train:7883-8445batch: iter_time=2.659e-04, forward_time=0.203, loss_att=43.088, acc=0.960, loss=43.088, backward_time=0.298, grad_norm=85.357, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.914e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:00:51,078 (trainer:732) INFO: 36epoch:train:8446-9008batch: iter_time=2.603e-04, forward_time=0.203, loss_att=45.718, acc=0.959, loss=45.718, backward_time=0.298, grad_norm=88.059, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.908e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:07:08,677 (trainer:732) INFO: 36epoch:train:9009-9571batch: iter_time=2.573e-04, forward_time=0.203, loss_att=45.432, acc=0.959, loss=45.432, backward_time=0.299, grad_norm=85.898, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.902e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:13:24,768 (trainer:732) INFO: 36epoch:train:9572-10134batch: iter_time=2.592e-04, forward_time=0.203, loss_att=46.226, acc=0.958, loss=46.226, backward_time=0.298, grad_norm=82.543, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.896e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:19:39,653 (trainer:732) INFO: 36epoch:train:10135-10697batch: iter_time=2.682e-04, forward_time=0.202, loss_att=44.950, acc=0.959, loss=44.950, backward_time=0.296, grad_norm=89.972, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.890e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:25:56,968 (trainer:732) INFO: 36epoch:train:10698-11260batch: iter_time=2.624e-04, forward_time=0.203, loss_att=44.004, acc=0.960, loss=44.004, backward_time=0.298, grad_norm=82.103, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.883e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:34:24,960 (trainer:338) INFO: 36epoch results: [train] iter_time=3.666e-04, forward_time=0.202, loss_att=44.345, acc=0.959, loss=44.345, backward_time=0.297, grad_norm=86.525, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.943e-04, train_time=2.731, time=2 hours, 8 minutes and 30.12 seconds, total_count=405792, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.848, acc=0.982, cer=0.022, wer=0.085, loss=10.848, time=4 minutes and 51.63 seconds, total_count=2016, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 16.61 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:34:29,018 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:34:29,031 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/25epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:34:29,032 (trainer:272) INFO: 37/60epoch started. Estimated time to finish: 2 days, 6 hours and 33 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:43:26,099 (trainer:732) INFO: 37epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=42.925, acc=0.960, loss=42.925, backward_time=0.297, grad_norm=88.060, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.877e-04, train_time=3.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:49:40,279 (trainer:732) INFO: 37epoch:train:564-1126batch: iter_time=2.622e-04, forward_time=0.201, loss_att=42.773, acc=0.960, loss=42.773, backward_time=0.296, grad_norm=87.858, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.871e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 02:55:55,983 (trainer:732) INFO: 37epoch:train:1127-1689batch: iter_time=2.711e-04, forward_time=0.202, loss_att=43.731, acc=0.960, loss=43.731, backward_time=0.297, grad_norm=86.961, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.865e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:02:13,257 (trainer:732) INFO: 37epoch:train:1690-2252batch: iter_time=2.651e-04, forward_time=0.203, loss_att=44.246, acc=0.960, loss=44.246, backward_time=0.298, grad_norm=83.355, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.859e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:08:31,202 (trainer:732) INFO: 37epoch:train:2253-2815batch: iter_time=2.617e-04, forward_time=0.204, loss_att=43.728, acc=0.960, loss=43.728, backward_time=0.300, grad_norm=89.646, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.853e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:14:47,466 (trainer:732) INFO: 37epoch:train:2816-3378batch: iter_time=2.710e-04, forward_time=0.203, loss_att=43.924, acc=0.960, loss=43.924, backward_time=0.297, grad_norm=85.245, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.846e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:21:04,284 (trainer:732) INFO: 37epoch:train:3379-3941batch: iter_time=2.660e-04, forward_time=0.203, loss_att=44.643, acc=0.960, loss=44.643, backward_time=0.298, grad_norm=87.738, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.840e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:27:21,381 (trainer:732) INFO: 37epoch:train:3942-4504batch: iter_time=2.605e-04, forward_time=0.204, loss_att=43.718, acc=0.960, loss=43.718, backward_time=0.299, grad_norm=85.713, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.834e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:33:36,627 (trainer:732) INFO: 37epoch:train:4505-5067batch: iter_time=2.668e-04, forward_time=0.202, loss_att=43.132, acc=0.960, loss=43.132, backward_time=0.297, grad_norm=86.088, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.828e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:39:52,563 (trainer:732) INFO: 37epoch:train:5068-5630batch: iter_time=2.718e-04, forward_time=0.203, loss_att=44.412, acc=0.960, loss=44.412, backward_time=0.297, grad_norm=84.884, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.822e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:46:09,765 (trainer:732) INFO: 37epoch:train:5631-6193batch: iter_time=2.667e-04, forward_time=0.203, loss_att=43.371, acc=0.961, loss=43.371, backward_time=0.298, grad_norm=84.840, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.816e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:52:25,066 (trainer:732) INFO: 37epoch:train:6194-6756batch: iter_time=2.695e-04, forward_time=0.202, loss_att=44.028, acc=0.959, loss=44.028, backward_time=0.297, grad_norm=88.405, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.810e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 03:58:38,221 (trainer:732) INFO: 37epoch:train:6757-7319batch: iter_time=2.599e-04, forward_time=0.201, loss_att=43.301, acc=0.960, loss=43.301, backward_time=0.296, grad_norm=82.713, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.804e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:04:54,941 (trainer:732) INFO: 37epoch:train:7320-7882batch: iter_time=2.589e-04, forward_time=0.203, loss_att=42.875, acc=0.960, loss=42.875, backward_time=0.298, grad_norm=86.355, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.798e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:11:10,091 (trainer:732) INFO: 37epoch:train:7883-8445batch: iter_time=2.676e-04, forward_time=0.201, loss_att=43.372, acc=0.960, loss=43.372, backward_time=0.296, grad_norm=82.232, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.792e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:17:24,521 (trainer:732) INFO: 37epoch:train:8446-9008batch: iter_time=2.639e-04, forward_time=0.202, loss_att=44.267, acc=0.959, loss=44.267, backward_time=0.296, grad_norm=83.943, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.786e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:23:39,438 (trainer:732) INFO: 37epoch:train:9009-9571batch: iter_time=2.625e-04, forward_time=0.202, loss_att=44.824, acc=0.959, loss=44.824, backward_time=0.297, grad_norm=80.403, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.780e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:29:54,897 (trainer:732) INFO: 37epoch:train:9572-10134batch: iter_time=2.648e-04, forward_time=0.202, loss_att=44.334, acc=0.959, loss=44.334, backward_time=0.297, grad_norm=86.191, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.774e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:36:08,776 (trainer:732) INFO: 37epoch:train:10135-10697batch: iter_time=2.650e-04, forward_time=0.201, loss_att=43.324, acc=0.960, loss=43.324, backward_time=0.295, grad_norm=85.696, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.768e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:42:25,240 (trainer:732) INFO: 37epoch:train:10698-11260batch: iter_time=2.570e-04, forward_time=0.202, loss_att=44.869, acc=0.960, loss=44.869, backward_time=0.297, grad_norm=93.834, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.762e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:49:53,910 (trainer:338) INFO: 37epoch results: [train] iter_time=3.905e-04, forward_time=0.202, loss_att=43.786, acc=0.960, loss=43.786, backward_time=0.297, grad_norm=86.006, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.819e-04, train_time=2.726, time=2 hours, 8 minutes and 10.61 seconds, total_count=417064, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.733, acc=0.983, cer=0.022, wer=0.084, loss=10.733, time=3 minutes and 59.15 seconds, total_count=2072, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.11 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:49:57,950 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:49:57,963 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/27epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:49:57,964 (trainer:272) INFO: 38/60epoch started. Estimated time to finish: 2 days, 4 hours and 16 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 04:58:52,367 (trainer:732) INFO: 38epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=41.415, acc=0.961, loss=41.415, backward_time=0.296, grad_norm=82.993, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.756e-04, train_time=3.803 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:05:09,826 (trainer:732) INFO: 38epoch:train:564-1126batch: iter_time=2.759e-04, forward_time=0.203, loss_att=42.758, acc=0.961, loss=42.758, backward_time=0.298, grad_norm=84.485, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.750e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:11:25,833 (trainer:732) INFO: 38epoch:train:1127-1689batch: iter_time=2.756e-04, forward_time=0.202, loss_att=42.929, acc=0.961, loss=42.929, backward_time=0.297, grad_norm=88.585, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.745e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:17:43,652 (trainer:732) INFO: 38epoch:train:1690-2252batch: iter_time=2.850e-04, forward_time=0.204, loss_att=43.000, acc=0.961, loss=43.000, backward_time=0.299, grad_norm=93.890, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.739e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:23:59,826 (trainer:732) INFO: 38epoch:train:2253-2815batch: iter_time=2.661e-04, forward_time=0.203, loss_att=42.671, acc=0.961, loss=42.671, backward_time=0.298, grad_norm=87.481, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.733e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:30:14,805 (trainer:732) INFO: 38epoch:train:2816-3378batch: iter_time=2.698e-04, forward_time=0.202, loss_att=42.179, acc=0.961, loss=42.179, backward_time=0.296, grad_norm=86.859, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.727e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:36:29,973 (trainer:732) INFO: 38epoch:train:3379-3941batch: iter_time=2.848e-04, forward_time=0.202, loss_att=43.440, acc=0.960, loss=43.440, backward_time=0.297, grad_norm=83.472, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.721e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:42:45,109 (trainer:732) INFO: 38epoch:train:3942-4504batch: iter_time=2.682e-04, forward_time=0.202, loss_att=42.443, acc=0.961, loss=42.443, backward_time=0.296, grad_norm=87.561, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.715e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:48:59,565 (trainer:732) INFO: 38epoch:train:4505-5067batch: iter_time=2.628e-04, forward_time=0.202, loss_att=44.475, acc=0.960, loss=44.475, backward_time=0.296, grad_norm=86.472, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.710e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 05:55:14,798 (trainer:732) INFO: 38epoch:train:5068-5630batch: iter_time=2.638e-04, forward_time=0.202, loss_att=42.599, acc=0.961, loss=42.599, backward_time=0.297, grad_norm=88.998, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.704e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:01:33,531 (trainer:732) INFO: 38epoch:train:5631-6193batch: iter_time=2.682e-04, forward_time=0.204, loss_att=44.186, acc=0.961, loss=44.186, backward_time=0.300, grad_norm=88.623, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.698e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:07:49,076 (trainer:732) INFO: 38epoch:train:6194-6756batch: iter_time=2.660e-04, forward_time=0.202, loss_att=42.796, acc=0.961, loss=42.796, backward_time=0.297, grad_norm=86.773, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.692e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:14:02,608 (trainer:732) INFO: 38epoch:train:6757-7319batch: iter_time=2.704e-04, forward_time=0.201, loss_att=43.876, acc=0.960, loss=43.876, backward_time=0.296, grad_norm=90.460, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.686e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:20:18,540 (trainer:732) INFO: 38epoch:train:7320-7882batch: iter_time=2.607e-04, forward_time=0.202, loss_att=44.750, acc=0.959, loss=44.750, backward_time=0.298, grad_norm=87.606, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.681e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:26:34,803 (trainer:732) INFO: 38epoch:train:7883-8445batch: iter_time=2.652e-04, forward_time=0.203, loss_att=43.956, acc=0.960, loss=43.956, backward_time=0.298, grad_norm=88.500, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.675e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:32:51,611 (trainer:732) INFO: 38epoch:train:8446-9008batch: iter_time=2.747e-04, forward_time=0.202, loss_att=44.458, acc=0.960, loss=44.458, backward_time=0.298, grad_norm=86.520, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.669e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:39:08,043 (trainer:732) INFO: 38epoch:train:9009-9571batch: iter_time=2.686e-04, forward_time=0.203, loss_att=42.592, acc=0.961, loss=42.592, backward_time=0.298, grad_norm=85.809, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.663e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:45:21,379 (trainer:732) INFO: 38epoch:train:9572-10134batch: iter_time=2.727e-04, forward_time=0.201, loss_att=42.990, acc=0.960, loss=42.990, backward_time=0.295, grad_norm=81.685, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.658e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:51:35,599 (trainer:732) INFO: 38epoch:train:10135-10697batch: iter_time=2.680e-04, forward_time=0.202, loss_att=43.683, acc=0.960, loss=43.683, backward_time=0.296, grad_norm=94.641, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.652e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 06:57:50,710 (trainer:732) INFO: 38epoch:train:10698-11260batch: iter_time=2.613e-04, forward_time=0.202, loss_att=44.379, acc=0.959, loss=44.379, backward_time=0.297, grad_norm=86.525, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.646e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:05:55,764 (trainer:338) INFO: 38epoch results: [train] iter_time=4.040e-04, forward_time=0.202, loss_att=43.279, acc=0.960, loss=43.279, backward_time=0.297, grad_norm=87.393, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.701e-04, train_time=2.725, time=2 hours, 8 minutes and 11.98 seconds, total_count=428336, gpu_max_cached_mem_GB=30.271, [valid] loss_att=11.074, acc=0.982, cer=0.022, wer=0.085, loss=11.074, time=4 minutes and 32.52 seconds, total_count=2128, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.3 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:06:02,278 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:06:02,291 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/26epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:06:02,292 (trainer:272) INFO: 39/60epoch started. Estimated time to finish: 2 days, 2 hours and 9.14 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:14:56,113 (trainer:732) INFO: 39epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=41.560, acc=0.961, loss=41.560, backward_time=0.297, grad_norm=84.298, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.640e-04, train_time=3.800 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:21:12,201 (trainer:732) INFO: 39epoch:train:564-1126batch: iter_time=2.583e-04, forward_time=0.203, loss_att=43.556, acc=0.961, loss=43.556, backward_time=0.298, grad_norm=89.796, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.635e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:27:27,234 (trainer:732) INFO: 39epoch:train:1127-1689batch: iter_time=2.528e-04, forward_time=0.202, loss_att=42.271, acc=0.961, loss=42.271, backward_time=0.297, grad_norm=84.523, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.629e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:33:42,138 (trainer:732) INFO: 39epoch:train:1690-2252batch: iter_time=2.534e-04, forward_time=0.202, loss_att=42.265, acc=0.961, loss=42.265, backward_time=0.297, grad_norm=83.953, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.623e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:39:56,690 (trainer:732) INFO: 39epoch:train:2253-2815batch: iter_time=2.441e-04, forward_time=0.201, loss_att=42.325, acc=0.961, loss=42.325, backward_time=0.296, grad_norm=84.029, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.618e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:46:11,895 (trainer:732) INFO: 39epoch:train:2816-3378batch: iter_time=2.523e-04, forward_time=0.202, loss_att=43.249, acc=0.960, loss=43.249, backward_time=0.297, grad_norm=85.675, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.612e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:52:28,054 (trainer:732) INFO: 39epoch:train:3379-3941batch: iter_time=2.436e-04, forward_time=0.202, loss_att=42.407, acc=0.961, loss=42.407, backward_time=0.297, grad_norm=83.489, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.607e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 07:58:42,631 (trainer:732) INFO: 39epoch:train:3942-4504batch: iter_time=2.521e-04, forward_time=0.202, loss_att=42.837, acc=0.961, loss=42.837, backward_time=0.296, grad_norm=83.583, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.601e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:04:58,666 (trainer:732) INFO: 39epoch:train:4505-5067batch: iter_time=2.490e-04, forward_time=0.202, loss_att=43.177, acc=0.961, loss=43.177, backward_time=0.297, grad_norm=85.642, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.595e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:11:15,475 (trainer:732) INFO: 39epoch:train:5068-5630batch: iter_time=2.558e-04, forward_time=0.203, loss_att=43.170, acc=0.961, loss=43.170, backward_time=0.298, grad_norm=87.986, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.590e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:17:32,790 (trainer:732) INFO: 39epoch:train:5631-6193batch: iter_time=2.484e-04, forward_time=0.203, loss_att=43.546, acc=0.961, loss=43.546, backward_time=0.299, grad_norm=87.917, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.584e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:23:47,919 (trainer:732) INFO: 39epoch:train:6194-6756batch: iter_time=2.528e-04, forward_time=0.201, loss_att=42.138, acc=0.961, loss=42.138, backward_time=0.296, grad_norm=86.604, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.579e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:30:03,876 (trainer:732) INFO: 39epoch:train:6757-7319batch: iter_time=2.485e-04, forward_time=0.203, loss_att=43.753, acc=0.960, loss=43.753, backward_time=0.299, grad_norm=80.675, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.573e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:36:21,492 (trainer:732) INFO: 39epoch:train:7320-7882batch: iter_time=2.495e-04, forward_time=0.203, loss_att=43.623, acc=0.961, loss=43.623, backward_time=0.299, grad_norm=83.426, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.568e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:42:37,781 (trainer:732) INFO: 39epoch:train:7883-8445batch: iter_time=2.501e-04, forward_time=0.203, loss_att=42.774, acc=0.960, loss=42.774, backward_time=0.298, grad_norm=88.460, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.562e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:48:51,548 (trainer:732) INFO: 39epoch:train:8446-9008batch: iter_time=2.481e-04, forward_time=0.201, loss_att=42.087, acc=0.961, loss=42.087, backward_time=0.295, grad_norm=88.296, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.557e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 08:55:07,659 (trainer:732) INFO: 39epoch:train:9009-9571batch: iter_time=2.468e-04, forward_time=0.203, loss_att=43.035, acc=0.960, loss=43.035, backward_time=0.298, grad_norm=88.218, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.551e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:01:21,815 (trainer:732) INFO: 39epoch:train:9572-10134batch: iter_time=2.470e-04, forward_time=0.201, loss_att=43.185, acc=0.959, loss=43.185, backward_time=0.296, grad_norm=81.067, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.546e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:07:37,870 (trainer:732) INFO: 39epoch:train:10135-10697batch: iter_time=2.484e-04, forward_time=0.202, loss_att=42.904, acc=0.961, loss=42.904, backward_time=0.298, grad_norm=86.907, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.540e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:13:52,340 (trainer:732) INFO: 39epoch:train:10698-11260batch: iter_time=2.485e-04, forward_time=0.201, loss_att=42.973, acc=0.960, loss=42.973, backward_time=0.295, grad_norm=82.653, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.535e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:21:57,368 (trainer:338) INFO: 39epoch results: [train] iter_time=3.549e-04, forward_time=0.202, loss_att=42.829, acc=0.961, loss=42.829, backward_time=0.297, grad_norm=85.372, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.587e-04, train_time=2.724, time=2 hours, 8 minutes and 6.05 seconds, total_count=439608, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.903, acc=0.982, cer=0.022, wer=0.084, loss=10.903, time=4 minutes and 35.39 seconds, total_count=2184, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.63 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:22:01,036 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:22:01,048 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/30epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:22:01,048 (trainer:272) INFO: 40/60epoch started. Estimated time to finish: 1 day, 23 hours and 43 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:30:54,555 (trainer:732) INFO: 40epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=41.026, acc=0.962, loss=41.026, backward_time=0.295, grad_norm=89.091, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.529e-04, train_time=3.797 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:37:09,629 (trainer:732) INFO: 40epoch:train:564-1126batch: iter_time=2.666e-04, forward_time=0.202, loss_att=40.685, acc=0.962, loss=40.685, backward_time=0.297, grad_norm=82.072, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.524e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:43:26,183 (trainer:732) INFO: 40epoch:train:1127-1689batch: iter_time=2.551e-04, forward_time=0.203, loss_att=42.359, acc=0.962, loss=42.359, backward_time=0.298, grad_norm=84.924, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.518e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:49:40,429 (trainer:732) INFO: 40epoch:train:1690-2252batch: iter_time=2.598e-04, forward_time=0.201, loss_att=42.616, acc=0.960, loss=42.616, backward_time=0.296, grad_norm=80.099, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.513e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 09:55:55,229 (trainer:732) INFO: 40epoch:train:2253-2815batch: iter_time=2.633e-04, forward_time=0.201, loss_att=41.923, acc=0.961, loss=41.923, backward_time=0.296, grad_norm=91.958, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.507e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:02:12,670 (trainer:732) INFO: 40epoch:train:2816-3378batch: iter_time=2.677e-04, forward_time=0.203, loss_att=41.772, acc=0.962, loss=41.772, backward_time=0.299, grad_norm=85.343, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.502e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:08:29,292 (trainer:732) INFO: 40epoch:train:3379-3941batch: iter_time=2.641e-04, forward_time=0.203, loss_att=42.870, acc=0.961, loss=42.870, backward_time=0.298, grad_norm=86.238, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.496e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:14:44,940 (trainer:732) INFO: 40epoch:train:3942-4504batch: iter_time=2.673e-04, forward_time=0.202, loss_att=43.763, acc=0.960, loss=43.763, backward_time=0.297, grad_norm=86.379, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.491e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:21:01,209 (trainer:732) INFO: 40epoch:train:4505-5067batch: iter_time=2.647e-04, forward_time=0.202, loss_att=42.880, acc=0.961, loss=42.880, backward_time=0.298, grad_norm=82.755, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.486e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:27:15,199 (trainer:732) INFO: 40epoch:train:5068-5630batch: iter_time=2.660e-04, forward_time=0.201, loss_att=41.040, acc=0.961, loss=41.040, backward_time=0.296, grad_norm=74.940, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.480e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:33:31,038 (trainer:732) INFO: 40epoch:train:5631-6193batch: iter_time=2.655e-04, forward_time=0.203, loss_att=42.526, acc=0.961, loss=42.526, backward_time=0.297, grad_norm=80.077, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.475e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:39:46,368 (trainer:732) INFO: 40epoch:train:6194-6756batch: iter_time=2.648e-04, forward_time=0.202, loss_att=43.741, acc=0.960, loss=43.741, backward_time=0.297, grad_norm=86.578, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.470e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:46:02,450 (trainer:732) INFO: 40epoch:train:6757-7319batch: iter_time=2.619e-04, forward_time=0.203, loss_att=41.506, acc=0.962, loss=41.506, backward_time=0.298, grad_norm=87.548, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.464e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:52:18,773 (trainer:732) INFO: 40epoch:train:7320-7882batch: iter_time=2.615e-04, forward_time=0.202, loss_att=42.390, acc=0.961, loss=42.390, backward_time=0.297, grad_norm=85.515, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.459e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 10:58:33,872 (trainer:732) INFO: 40epoch:train:7883-8445batch: iter_time=2.693e-04, forward_time=0.202, loss_att=41.962, acc=0.961, loss=41.962, backward_time=0.297, grad_norm=92.343, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.454e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:04:51,328 (trainer:732) INFO: 40epoch:train:8446-9008batch: iter_time=2.616e-04, forward_time=0.204, loss_att=43.336, acc=0.961, loss=43.336, backward_time=0.299, grad_norm=93.796, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.448e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:11:07,141 (trainer:732) INFO: 40epoch:train:9009-9571batch: iter_time=2.572e-04, forward_time=0.203, loss_att=42.426, acc=0.961, loss=42.426, backward_time=0.298, grad_norm=85.851, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.443e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:17:20,480 (trainer:732) INFO: 40epoch:train:9572-10134batch: iter_time=2.674e-04, forward_time=0.201, loss_att=42.694, acc=0.960, loss=42.694, backward_time=0.296, grad_norm=86.067, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.438e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:23:35,281 (trainer:732) INFO: 40epoch:train:10135-10697batch: iter_time=2.660e-04, forward_time=0.202, loss_att=42.918, acc=0.961, loss=42.918, backward_time=0.297, grad_norm=82.852, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.432e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:29:52,495 (trainer:732) INFO: 40epoch:train:10698-11260batch: iter_time=2.584e-04, forward_time=0.203, loss_att=42.790, acc=0.961, loss=42.790, backward_time=0.299, grad_norm=87.467, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.427e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:37:56,051 (trainer:338) INFO: 40epoch results: [train] iter_time=3.629e-04, forward_time=0.202, loss_att=42.345, acc=0.961, loss=42.345, backward_time=0.297, grad_norm=85.565, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.478e-04, train_time=2.725, time=2 hours, 8 minutes and 7.71 seconds, total_count=450880, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.613, acc=0.983, cer=0.021, wer=0.082, loss=10.613, time=4 minutes and 31.59 seconds, total_count=2240, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.68 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:37:59,731 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:37:59,743 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/34epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:37:59,744 (trainer:272) INFO: 41/60epoch started. Estimated time to finish: 1 day, 21 hours and 27 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:46:55,416 (trainer:732) INFO: 41epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=40.292, acc=0.962, loss=40.292, backward_time=0.298, grad_norm=83.991, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.422e-04, train_time=3.811 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:53:11,574 (trainer:732) INFO: 41epoch:train:564-1126batch: iter_time=2.620e-04, forward_time=0.203, loss_att=41.654, acc=0.962, loss=41.654, backward_time=0.298, grad_norm=96.572, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.417e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 11:59:30,044 (trainer:732) INFO: 41epoch:train:1127-1689batch: iter_time=2.582e-04, forward_time=0.204, loss_att=41.165, acc=0.963, loss=41.165, backward_time=0.300, grad_norm=85.927, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.411e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:05:47,343 (trainer:732) INFO: 41epoch:train:1690-2252batch: iter_time=2.638e-04, forward_time=0.203, loss_att=42.427, acc=0.962, loss=42.427, backward_time=0.298, grad_norm=83.659, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.406e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:12:01,457 (trainer:732) INFO: 41epoch:train:2253-2815batch: iter_time=2.569e-04, forward_time=0.201, loss_att=41.309, acc=0.962, loss=41.309, backward_time=0.295, grad_norm=92.163, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.401e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:18:17,352 (trainer:732) INFO: 41epoch:train:2816-3378batch: iter_time=2.589e-04, forward_time=0.202, loss_att=40.591, acc=0.963, loss=40.591, backward_time=0.297, grad_norm=88.145, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.396e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:24:31,920 (trainer:732) INFO: 41epoch:train:3379-3941batch: iter_time=2.592e-04, forward_time=0.201, loss_att=41.870, acc=0.961, loss=41.870, backward_time=0.296, grad_norm=83.829, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.390e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:30:47,604 (trainer:732) INFO: 41epoch:train:3942-4504batch: iter_time=2.613e-04, forward_time=0.202, loss_att=41.560, acc=0.961, loss=41.560, backward_time=0.297, grad_norm=92.777, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.385e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:37:03,468 (trainer:732) INFO: 41epoch:train:4505-5067batch: iter_time=2.648e-04, forward_time=0.203, loss_att=42.573, acc=0.961, loss=42.573, backward_time=0.298, grad_norm=83.726, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.380e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:43:19,455 (trainer:732) INFO: 41epoch:train:5068-5630batch: iter_time=2.581e-04, forward_time=0.202, loss_att=42.490, acc=0.961, loss=42.490, backward_time=0.297, grad_norm=78.497, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.375e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:49:35,209 (trainer:732) INFO: 41epoch:train:5631-6193batch: iter_time=2.612e-04, forward_time=0.202, loss_att=41.817, acc=0.962, loss=41.817, backward_time=0.297, grad_norm=93.021, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.370e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 12:55:50,139 (trainer:732) INFO: 41epoch:train:6194-6756batch: iter_time=2.589e-04, forward_time=0.203, loss_att=41.203, acc=0.962, loss=41.203, backward_time=0.297, grad_norm=87.753, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.365e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:02:03,803 (trainer:732) INFO: 41epoch:train:6757-7319batch: iter_time=2.611e-04, forward_time=0.202, loss_att=42.184, acc=0.961, loss=42.184, backward_time=0.296, grad_norm=91.900, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.359e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:08:18,054 (trainer:732) INFO: 41epoch:train:7320-7882batch: iter_time=2.544e-04, forward_time=0.202, loss_att=42.326, acc=0.961, loss=42.326, backward_time=0.296, grad_norm=82.641, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.354e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:14:32,600 (trainer:732) INFO: 41epoch:train:7883-8445batch: iter_time=2.701e-04, forward_time=0.201, loss_att=42.281, acc=0.961, loss=42.281, backward_time=0.296, grad_norm=84.790, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.349e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:20:47,380 (trainer:732) INFO: 41epoch:train:8446-9008batch: iter_time=2.577e-04, forward_time=0.202, loss_att=41.927, acc=0.961, loss=41.927, backward_time=0.297, grad_norm=85.874, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.344e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:27:04,232 (trainer:732) INFO: 41epoch:train:9009-9571batch: iter_time=2.606e-04, forward_time=0.203, loss_att=42.599, acc=0.961, loss=42.599, backward_time=0.299, grad_norm=88.660, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.339e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:33:18,221 (trainer:732) INFO: 41epoch:train:9572-10134batch: iter_time=2.578e-04, forward_time=0.201, loss_att=41.699, acc=0.961, loss=41.699, backward_time=0.295, grad_norm=91.621, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.334e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:39:34,703 (trainer:732) INFO: 41epoch:train:10135-10697batch: iter_time=2.561e-04, forward_time=0.202, loss_att=42.645, acc=0.961, loss=42.645, backward_time=0.298, grad_norm=83.951, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.329e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:45:49,581 (trainer:732) INFO: 41epoch:train:10698-11260batch: iter_time=2.639e-04, forward_time=0.201, loss_att=42.417, acc=0.961, loss=42.417, backward_time=0.296, grad_norm=89.777, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.324e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:53:56,808 (trainer:338) INFO: 41epoch results: [train] iter_time=3.408e-04, forward_time=0.202, loss_att=41.851, acc=0.962, loss=41.851, backward_time=0.297, grad_norm=87.479, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.372e-04, train_time=2.724, time=2 hours, 8 minutes and 7.23 seconds, total_count=462152, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.836, acc=0.982, cer=0.022, wer=0.084, loss=10.836, time=4 minutes and 36.02 seconds, total_count=2296, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.82 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:54:00,438 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:54:00,451 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/31epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 13:54:00,451 (trainer:272) INFO: 42/60epoch started. Estimated time to finish: 1 day, 19 hours and 10 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:02:54,923 (trainer:732) INFO: 42epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=40.702, acc=0.963, loss=40.702, backward_time=0.296, grad_norm=80.583, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.319e-04, train_time=3.804 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:09:10,397 (trainer:732) INFO: 42epoch:train:564-1126batch: iter_time=2.574e-04, forward_time=0.202, loss_att=40.185, acc=0.962, loss=40.185, backward_time=0.297, grad_norm=89.718, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.314e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:15:27,231 (trainer:732) INFO: 42epoch:train:1127-1689batch: iter_time=2.510e-04, forward_time=0.203, loss_att=40.728, acc=0.963, loss=40.728, backward_time=0.298, grad_norm=93.978, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.308e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:21:42,580 (trainer:732) INFO: 42epoch:train:1690-2252batch: iter_time=2.629e-04, forward_time=0.202, loss_att=40.330, acc=0.962, loss=40.330, backward_time=0.297, grad_norm=80.149, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.303e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:27:56,902 (trainer:732) INFO: 42epoch:train:2253-2815batch: iter_time=2.461e-04, forward_time=0.201, loss_att=40.616, acc=0.962, loss=40.616, backward_time=0.296, grad_norm=83.519, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.298e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:34:13,609 (trainer:732) INFO: 42epoch:train:2816-3378batch: iter_time=2.551e-04, forward_time=0.203, loss_att=41.341, acc=0.963, loss=41.341, backward_time=0.298, grad_norm=88.407, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.293e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:40:31,226 (trainer:732) INFO: 42epoch:train:3379-3941batch: iter_time=2.435e-04, forward_time=0.203, loss_att=42.216, acc=0.962, loss=42.216, backward_time=0.299, grad_norm=88.605, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.288e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:46:48,166 (trainer:732) INFO: 42epoch:train:3942-4504batch: iter_time=2.528e-04, forward_time=0.202, loss_att=41.204, acc=0.963, loss=41.204, backward_time=0.298, grad_norm=90.325, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.283e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:53:03,524 (trainer:732) INFO: 42epoch:train:4505-5067batch: iter_time=2.500e-04, forward_time=0.203, loss_att=41.445, acc=0.962, loss=41.445, backward_time=0.298, grad_norm=87.642, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.278e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 14:59:19,589 (trainer:732) INFO: 42epoch:train:5068-5630batch: iter_time=2.487e-04, forward_time=0.202, loss_att=40.617, acc=0.962, loss=40.617, backward_time=0.297, grad_norm=83.960, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.273e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:05:35,508 (trainer:732) INFO: 42epoch:train:5631-6193batch: iter_time=2.539e-04, forward_time=0.203, loss_att=42.717, acc=0.961, loss=42.717, backward_time=0.298, grad_norm=89.640, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.268e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:11:50,454 (trainer:732) INFO: 42epoch:train:6194-6756batch: iter_time=2.450e-04, forward_time=0.201, loss_att=42.206, acc=0.961, loss=42.206, backward_time=0.296, grad_norm=86.866, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.263e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:18:05,944 (trainer:732) INFO: 42epoch:train:6757-7319batch: iter_time=2.440e-04, forward_time=0.203, loss_att=42.587, acc=0.961, loss=42.587, backward_time=0.298, grad_norm=85.590, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.258e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:24:20,423 (trainer:732) INFO: 42epoch:train:7320-7882batch: iter_time=2.446e-04, forward_time=0.201, loss_att=41.513, acc=0.962, loss=41.513, backward_time=0.296, grad_norm=86.284, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.254e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:30:35,142 (trainer:732) INFO: 42epoch:train:7883-8445batch: iter_time=2.628e-04, forward_time=0.202, loss_att=42.109, acc=0.961, loss=42.109, backward_time=0.296, grad_norm=95.792, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.249e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:36:49,368 (trainer:732) INFO: 42epoch:train:8446-9008batch: iter_time=2.452e-04, forward_time=0.202, loss_att=41.961, acc=0.961, loss=41.961, backward_time=0.297, grad_norm=85.075, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.244e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:43:02,848 (trainer:732) INFO: 42epoch:train:9009-9571batch: iter_time=2.580e-04, forward_time=0.202, loss_att=41.069, acc=0.961, loss=41.069, backward_time=0.296, grad_norm=85.511, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.239e-04, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:49:19,139 (trainer:732) INFO: 42epoch:train:9572-10134batch: iter_time=2.498e-04, forward_time=0.202, loss_att=41.887, acc=0.961, loss=41.887, backward_time=0.298, grad_norm=82.334, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.234e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 15:55:34,372 (trainer:732) INFO: 42epoch:train:10135-10697batch: iter_time=2.569e-04, forward_time=0.202, loss_att=42.841, acc=0.961, loss=42.841, backward_time=0.297, grad_norm=89.938, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.229e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:01:49,311 (trainer:732) INFO: 42epoch:train:10698-11260batch: iter_time=2.399e-04, forward_time=0.202, loss_att=41.556, acc=0.962, loss=41.556, backward_time=0.296, grad_norm=90.811, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.224e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:09:54,189 (trainer:338) INFO: 42epoch results: [train] iter_time=3.723e-04, forward_time=0.202, loss_att=41.485, acc=0.962, loss=41.485, backward_time=0.297, grad_norm=87.249, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.271e-04, train_time=2.724, time=2 hours, 8 minutes and 4.78 seconds, total_count=473424, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.698, acc=0.983, cer=0.021, wer=0.082, loss=10.698, time=4 minutes and 34.03 seconds, total_count=2352, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.93 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:09:57,987 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:09:57,999 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/33epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:09:57,999 (trainer:272) INFO: 43/60epoch started. Estimated time to finish: 1 day, 16 hours and 54 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:18:53,709 (trainer:732) INFO: 43epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=39.853, acc=0.964, loss=39.853, backward_time=0.298, grad_norm=84.535, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.219e-04, train_time=3.812 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:25:10,459 (trainer:732) INFO: 43epoch:train:564-1126batch: iter_time=2.697e-04, forward_time=0.203, loss_att=41.012, acc=0.963, loss=41.012, backward_time=0.298, grad_norm=87.673, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.214e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:31:27,366 (trainer:732) INFO: 43epoch:train:1127-1689batch: iter_time=2.670e-04, forward_time=0.203, loss_att=40.397, acc=0.963, loss=40.397, backward_time=0.298, grad_norm=83.300, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.209e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:37:41,936 (trainer:732) INFO: 43epoch:train:1690-2252batch: iter_time=2.614e-04, forward_time=0.202, loss_att=40.472, acc=0.962, loss=40.472, backward_time=0.296, grad_norm=87.972, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.204e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:43:57,359 (trainer:732) INFO: 43epoch:train:2253-2815batch: iter_time=2.605e-04, forward_time=0.202, loss_att=40.491, acc=0.962, loss=40.491, backward_time=0.297, grad_norm=86.785, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.200e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:50:11,984 (trainer:732) INFO: 43epoch:train:2816-3378batch: iter_time=2.636e-04, forward_time=0.202, loss_att=39.411, acc=0.963, loss=39.411, backward_time=0.296, grad_norm=84.358, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.195e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 16:56:25,776 (trainer:732) INFO: 43epoch:train:3379-3941batch: iter_time=2.534e-04, forward_time=0.201, loss_att=40.837, acc=0.962, loss=40.837, backward_time=0.296, grad_norm=89.508, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.190e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:02:40,143 (trainer:732) INFO: 43epoch:train:3942-4504batch: iter_time=2.569e-04, forward_time=0.201, loss_att=40.851, acc=0.962, loss=40.851, backward_time=0.296, grad_norm=84.298, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.185e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:08:55,804 (trainer:732) INFO: 43epoch:train:4505-5067batch: iter_time=2.490e-04, forward_time=0.202, loss_att=41.139, acc=0.962, loss=41.139, backward_time=0.297, grad_norm=87.509, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.180e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:15:11,459 (trainer:732) INFO: 43epoch:train:5068-5630batch: iter_time=2.586e-04, forward_time=0.202, loss_att=40.296, acc=0.963, loss=40.296, backward_time=0.297, grad_norm=86.067, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.175e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:21:25,052 (trainer:732) INFO: 43epoch:train:5631-6193batch: iter_time=2.526e-04, forward_time=0.201, loss_att=40.710, acc=0.962, loss=40.710, backward_time=0.295, grad_norm=87.562, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.171e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:27:41,077 (trainer:732) INFO: 43epoch:train:6194-6756batch: iter_time=2.485e-04, forward_time=0.202, loss_att=40.177, acc=0.963, loss=40.177, backward_time=0.297, grad_norm=84.505, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.166e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:33:58,175 (trainer:732) INFO: 43epoch:train:6757-7319batch: iter_time=2.603e-04, forward_time=0.203, loss_att=42.642, acc=0.961, loss=42.642, backward_time=0.299, grad_norm=88.992, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.161e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:40:14,494 (trainer:732) INFO: 43epoch:train:7320-7882batch: iter_time=2.611e-04, forward_time=0.203, loss_att=41.635, acc=0.962, loss=41.635, backward_time=0.298, grad_norm=84.469, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.156e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:46:29,096 (trainer:732) INFO: 43epoch:train:7883-8445batch: iter_time=2.618e-04, forward_time=0.202, loss_att=42.009, acc=0.962, loss=42.009, backward_time=0.296, grad_norm=81.360, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.151e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:52:46,240 (trainer:732) INFO: 43epoch:train:8446-9008batch: iter_time=2.576e-04, forward_time=0.203, loss_att=41.813, acc=0.962, loss=41.813, backward_time=0.298, grad_norm=86.948, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.147e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 17:59:02,019 (trainer:732) INFO: 43epoch:train:9009-9571batch: iter_time=2.652e-04, forward_time=0.203, loss_att=41.444, acc=0.962, loss=41.444, backward_time=0.298, grad_norm=83.157, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.142e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:05:16,984 (trainer:732) INFO: 43epoch:train:9572-10134batch: iter_time=2.519e-04, forward_time=0.202, loss_att=43.531, acc=0.961, loss=43.531, backward_time=0.297, grad_norm=84.086, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.137e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:11:32,616 (trainer:732) INFO: 43epoch:train:10135-10697batch: iter_time=2.493e-04, forward_time=0.202, loss_att=41.016, acc=0.962, loss=41.016, backward_time=0.297, grad_norm=83.954, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.132e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:17:49,055 (trainer:732) INFO: 43epoch:train:10698-11260batch: iter_time=2.544e-04, forward_time=0.203, loss_att=40.901, acc=0.963, loss=40.901, backward_time=0.298, grad_norm=85.060, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.128e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:26:04,919 (trainer:338) INFO: 43epoch results: [train] iter_time=3.597e-04, forward_time=0.202, loss_att=41.030, acc=0.962, loss=41.030, backward_time=0.297, grad_norm=85.602, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.173e-04, train_time=2.725, time=2 hours, 8 minutes and 9.47 seconds, total_count=484696, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.815, acc=0.983, cer=0.022, wer=0.083, loss=10.815, time=4 minutes and 43.06 seconds, total_count=2408, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.38 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:26:08,801 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:26:08,814 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/29epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:26:08,815 (trainer:272) INFO: 44/60epoch started. Estimated time to finish: 1 day, 14 hours and 37 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:35:05,228 (trainer:732) INFO: 44epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=40.033, acc=0.963, loss=40.033, backward_time=0.297, grad_norm=82.936, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.123e-04, train_time=3.817 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:41:20,465 (trainer:732) INFO: 44epoch:train:564-1126batch: iter_time=2.628e-04, forward_time=0.202, loss_att=40.426, acc=0.963, loss=40.426, backward_time=0.297, grad_norm=83.210, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.118e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:47:36,186 (trainer:732) INFO: 44epoch:train:1127-1689batch: iter_time=2.744e-04, forward_time=0.202, loss_att=39.949, acc=0.963, loss=39.949, backward_time=0.297, grad_norm=83.318, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.114e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 18:53:52,936 (trainer:732) INFO: 44epoch:train:1690-2252batch: iter_time=2.603e-04, forward_time=0.203, loss_att=39.985, acc=0.963, loss=39.985, backward_time=0.298, grad_norm=97.337, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.109e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:00:09,204 (trainer:732) INFO: 44epoch:train:2253-2815batch: iter_time=2.538e-04, forward_time=0.202, loss_att=40.397, acc=0.963, loss=40.397, backward_time=0.298, grad_norm=84.898, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.104e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:06:23,953 (trainer:732) INFO: 44epoch:train:2816-3378batch: iter_time=2.489e-04, forward_time=0.202, loss_att=41.978, acc=0.962, loss=41.978, backward_time=0.297, grad_norm=85.994, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.099e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:12:40,394 (trainer:732) INFO: 44epoch:train:3379-3941batch: iter_time=2.462e-04, forward_time=0.203, loss_att=41.178, acc=0.963, loss=41.178, backward_time=0.298, grad_norm=89.349, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.095e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:18:56,081 (trainer:732) INFO: 44epoch:train:3942-4504batch: iter_time=2.497e-04, forward_time=0.202, loss_att=40.823, acc=0.963, loss=40.823, backward_time=0.297, grad_norm=89.433, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.090e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:25:12,071 (trainer:732) INFO: 44epoch:train:4505-5067batch: iter_time=2.430e-04, forward_time=0.203, loss_att=40.318, acc=0.962, loss=40.318, backward_time=0.298, grad_norm=91.000, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.085e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:31:27,249 (trainer:732) INFO: 44epoch:train:5068-5630batch: iter_time=2.608e-04, forward_time=0.203, loss_att=39.176, acc=0.964, loss=39.176, backward_time=0.298, grad_norm=83.370, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.081e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:37:42,460 (trainer:732) INFO: 44epoch:train:5631-6193batch: iter_time=2.556e-04, forward_time=0.202, loss_att=40.909, acc=0.962, loss=40.909, backward_time=0.297, grad_norm=82.779, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.076e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:43:57,310 (trainer:732) INFO: 44epoch:train:6194-6756batch: iter_time=2.514e-04, forward_time=0.202, loss_att=40.634, acc=0.962, loss=40.634, backward_time=0.297, grad_norm=82.584, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.072e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:50:11,748 (trainer:732) INFO: 44epoch:train:6757-7319batch: iter_time=2.445e-04, forward_time=0.202, loss_att=40.096, acc=0.963, loss=40.096, backward_time=0.297, grad_norm=87.254, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.067e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 19:56:27,027 (trainer:732) INFO: 44epoch:train:7320-7882batch: iter_time=2.376e-04, forward_time=0.202, loss_att=40.626, acc=0.963, loss=40.626, backward_time=0.297, grad_norm=82.014, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.062e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:02:41,851 (trainer:732) INFO: 44epoch:train:7883-8445batch: iter_time=2.430e-04, forward_time=0.202, loss_att=39.663, acc=0.963, loss=39.663, backward_time=0.297, grad_norm=81.854, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.058e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:08:58,020 (trainer:732) INFO: 44epoch:train:8446-9008batch: iter_time=2.439e-04, forward_time=0.202, loss_att=40.864, acc=0.963, loss=40.864, backward_time=0.297, grad_norm=91.297, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.053e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:15:10,600 (trainer:732) INFO: 44epoch:train:9009-9571batch: iter_time=2.463e-04, forward_time=0.201, loss_att=41.193, acc=0.962, loss=41.193, backward_time=0.295, grad_norm=79.963, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.049e-04, train_time=2.648 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:21:24,901 (trainer:732) INFO: 44epoch:train:9572-10134batch: iter_time=2.482e-04, forward_time=0.202, loss_att=40.548, acc=0.962, loss=40.548, backward_time=0.296, grad_norm=82.646, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.044e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:27:41,624 (trainer:732) INFO: 44epoch:train:10135-10697batch: iter_time=2.507e-04, forward_time=0.202, loss_att=41.068, acc=0.963, loss=41.068, backward_time=0.298, grad_norm=88.529, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.039e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:34:00,127 (trainer:732) INFO: 44epoch:train:10698-11260batch: iter_time=2.537e-04, forward_time=0.204, loss_att=42.051, acc=0.962, loss=42.051, backward_time=0.299, grad_norm=86.076, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.035e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:42:02,701 (trainer:338) INFO: 44epoch results: [train] iter_time=3.446e-04, forward_time=0.202, loss_att=40.595, acc=0.963, loss=40.595, backward_time=0.297, grad_norm=85.791, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.079e-04, train_time=2.725, time=2 hours, 8 minutes and 7.19 seconds, total_count=495968, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.588, acc=0.983, cer=0.021, wer=0.082, loss=10.588, time=4 minutes and 30.31 seconds, total_count=2464, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 16.39 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:42:06,773 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:42:06,786 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/32epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:42:06,786 (trainer:272) INFO: 45/60epoch started. Estimated time to finish: 1 day, 12 hours and 21 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:51:07,137 (trainer:732) INFO: 45epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=39.666, acc=0.964, loss=39.666, backward_time=0.297, grad_norm=83.336, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.030e-04, train_time=3.846 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 20:57:24,449 (trainer:732) INFO: 45epoch:train:564-1126batch: iter_time=2.548e-04, forward_time=0.203, loss_att=39.782, acc=0.964, loss=39.782, backward_time=0.299, grad_norm=92.322, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.026e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:03:39,539 (trainer:732) INFO: 45epoch:train:1127-1689batch: iter_time=2.645e-04, forward_time=0.202, loss_att=38.609, acc=0.964, loss=38.609, backward_time=0.297, grad_norm=84.805, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.021e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:09:55,276 (trainer:732) INFO: 45epoch:train:1690-2252batch: iter_time=2.550e-04, forward_time=0.202, loss_att=39.817, acc=0.963, loss=39.817, backward_time=0.297, grad_norm=84.475, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.016e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:16:09,320 (trainer:732) INFO: 45epoch:train:2253-2815batch: iter_time=2.482e-04, forward_time=0.201, loss_att=39.729, acc=0.962, loss=39.729, backward_time=0.296, grad_norm=90.478, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.012e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:22:24,341 (trainer:732) INFO: 45epoch:train:2816-3378batch: iter_time=2.554e-04, forward_time=0.202, loss_att=39.725, acc=0.963, loss=39.725, backward_time=0.297, grad_norm=84.103, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.007e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:28:40,289 (trainer:732) INFO: 45epoch:train:3379-3941batch: iter_time=2.505e-04, forward_time=0.202, loss_att=40.321, acc=0.963, loss=40.321, backward_time=0.297, grad_norm=83.820, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.003e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:34:57,950 (trainer:732) INFO: 45epoch:train:3942-4504batch: iter_time=2.626e-04, forward_time=0.203, loss_att=40.211, acc=0.963, loss=40.211, backward_time=0.299, grad_norm=88.759, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.998e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:41:12,377 (trainer:732) INFO: 45epoch:train:4505-5067batch: iter_time=2.673e-04, forward_time=0.202, loss_att=39.289, acc=0.963, loss=39.289, backward_time=0.296, grad_norm=83.487, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.994e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:47:29,864 (trainer:732) INFO: 45epoch:train:5068-5630batch: iter_time=2.699e-04, forward_time=0.203, loss_att=40.966, acc=0.963, loss=40.966, backward_time=0.298, grad_norm=93.144, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.989e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 21:53:46,808 (trainer:732) INFO: 45epoch:train:5631-6193batch: iter_time=2.526e-04, forward_time=0.203, loss_att=41.622, acc=0.963, loss=41.622, backward_time=0.299, grad_norm=84.106, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.985e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:00:01,416 (trainer:732) INFO: 45epoch:train:6194-6756batch: iter_time=2.531e-04, forward_time=0.202, loss_att=39.885, acc=0.963, loss=39.885, backward_time=0.297, grad_norm=94.745, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.980e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:06:16,932 (trainer:732) INFO: 45epoch:train:6757-7319batch: iter_time=2.451e-04, forward_time=0.202, loss_att=41.318, acc=0.962, loss=41.318, backward_time=0.297, grad_norm=85.818, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.976e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:12:30,570 (trainer:732) INFO: 45epoch:train:7320-7882batch: iter_time=2.518e-04, forward_time=0.201, loss_att=39.660, acc=0.963, loss=39.660, backward_time=0.295, grad_norm=84.491, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.972e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:18:45,697 (trainer:732) INFO: 45epoch:train:7883-8445batch: iter_time=2.535e-04, forward_time=0.202, loss_att=40.622, acc=0.963, loss=40.622, backward_time=0.296, grad_norm=88.143, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.967e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:25:01,635 (trainer:732) INFO: 45epoch:train:8446-9008batch: iter_time=2.528e-04, forward_time=0.202, loss_att=39.883, acc=0.963, loss=39.883, backward_time=0.297, grad_norm=83.607, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.963e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:31:18,521 (trainer:732) INFO: 45epoch:train:9009-9571batch: iter_time=2.527e-04, forward_time=0.203, loss_att=41.239, acc=0.962, loss=41.239, backward_time=0.298, grad_norm=85.811, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.958e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:37:32,146 (trainer:732) INFO: 45epoch:train:9572-10134batch: iter_time=2.559e-04, forward_time=0.201, loss_att=41.028, acc=0.962, loss=41.028, backward_time=0.295, grad_norm=85.029, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.954e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:43:48,298 (trainer:732) INFO: 45epoch:train:10135-10697batch: iter_time=2.484e-04, forward_time=0.202, loss_att=40.222, acc=0.963, loss=40.222, backward_time=0.297, grad_norm=82.648, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.949e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:50:04,986 (trainer:732) INFO: 45epoch:train:10698-11260batch: iter_time=2.413e-04, forward_time=0.203, loss_att=41.087, acc=0.963, loss=41.087, backward_time=0.298, grad_norm=82.600, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.945e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:58:18,718 (trainer:338) INFO: 45epoch results: [train] iter_time=3.828e-04, forward_time=0.202, loss_att=40.234, acc=0.963, loss=40.234, backward_time=0.297, grad_norm=86.261, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.987e-04, train_time=2.727, time=2 hours, 8 minutes and 15.7 seconds, total_count=507240, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.576, acc=0.983, cer=0.021, wer=0.081, loss=10.576, time=4 minutes and 42.81 seconds, total_count=2520, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.42 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:58:22,977 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:58:22,989 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/38epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 22:58:22,989 (trainer:272) INFO: 46/60epoch started. Estimated time to finish: 1 day, 10 hours and 4 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:07:20,813 (trainer:732) INFO: 46epoch:train:1-563batch: iter_time=0.003, forward_time=0.203, loss_att=38.575, acc=0.964, loss=38.575, backward_time=0.297, grad_norm=79.333, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.940e-04, train_time=3.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:13:36,311 (trainer:732) INFO: 46epoch:train:564-1126batch: iter_time=3.060e-04, forward_time=0.202, loss_att=39.197, acc=0.964, loss=39.197, backward_time=0.296, grad_norm=84.934, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.936e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:19:54,050 (trainer:732) INFO: 46epoch:train:1127-1689batch: iter_time=2.892e-04, forward_time=0.203, loss_att=40.231, acc=0.964, loss=40.231, backward_time=0.298, grad_norm=85.002, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.932e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:26:09,136 (trainer:732) INFO: 46epoch:train:1690-2252batch: iter_time=2.811e-04, forward_time=0.202, loss_att=39.980, acc=0.963, loss=39.980, backward_time=0.297, grad_norm=88.475, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.927e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:32:24,782 (trainer:732) INFO: 46epoch:train:2253-2815batch: iter_time=2.715e-04, forward_time=0.203, loss_att=39.177, acc=0.964, loss=39.177, backward_time=0.297, grad_norm=77.357, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.923e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:38:40,335 (trainer:732) INFO: 46epoch:train:2816-3378batch: iter_time=2.875e-04, forward_time=0.203, loss_att=39.907, acc=0.963, loss=39.907, backward_time=0.297, grad_norm=83.917, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.919e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:44:58,932 (trainer:732) INFO: 46epoch:train:3379-3941batch: iter_time=2.760e-04, forward_time=0.204, loss_att=40.415, acc=0.963, loss=40.415, backward_time=0.299, grad_norm=85.056, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.914e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:51:13,336 (trainer:732) INFO: 46epoch:train:3942-4504batch: iter_time=2.854e-04, forward_time=0.201, loss_att=39.526, acc=0.963, loss=39.526, backward_time=0.295, grad_norm=84.998, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.910e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-25 23:57:27,813 (trainer:732) INFO: 46epoch:train:4505-5067batch: iter_time=2.697e-04, forward_time=0.202, loss_att=39.976, acc=0.963, loss=39.976, backward_time=0.297, grad_norm=82.114, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.905e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:03:42,592 (trainer:732) INFO: 46epoch:train:5068-5630batch: iter_time=2.752e-04, forward_time=0.202, loss_att=39.527, acc=0.964, loss=39.527, backward_time=0.297, grad_norm=85.212, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.901e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:09:59,251 (trainer:732) INFO: 46epoch:train:5631-6193batch: iter_time=2.635e-04, forward_time=0.202, loss_att=39.607, acc=0.963, loss=39.607, backward_time=0.298, grad_norm=86.915, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.897e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:16:16,384 (trainer:732) INFO: 46epoch:train:6194-6756batch: iter_time=2.674e-04, forward_time=0.203, loss_att=40.934, acc=0.963, loss=40.934, backward_time=0.298, grad_norm=102.131, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.892e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:22:31,347 (trainer:732) INFO: 46epoch:train:6757-7319batch: iter_time=2.718e-04, forward_time=0.202, loss_att=40.722, acc=0.963, loss=40.722, backward_time=0.296, grad_norm=88.077, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.888e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:28:47,793 (trainer:732) INFO: 46epoch:train:7320-7882batch: iter_time=2.831e-04, forward_time=0.202, loss_att=40.611, acc=0.964, loss=40.611, backward_time=0.298, grad_norm=87.658, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.884e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:35:04,051 (trainer:732) INFO: 46epoch:train:7883-8445batch: iter_time=2.746e-04, forward_time=0.202, loss_att=40.135, acc=0.963, loss=40.135, backward_time=0.298, grad_norm=91.021, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.880e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:41:18,092 (trainer:732) INFO: 46epoch:train:8446-9008batch: iter_time=2.680e-04, forward_time=0.201, loss_att=39.907, acc=0.963, loss=39.907, backward_time=0.296, grad_norm=81.944, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.875e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:47:32,628 (trainer:732) INFO: 46epoch:train:9009-9571batch: iter_time=2.724e-04, forward_time=0.201, loss_att=38.733, acc=0.964, loss=38.733, backward_time=0.296, grad_norm=82.268, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.871e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 00:53:49,289 (trainer:732) INFO: 46epoch:train:9572-10134batch: iter_time=2.861e-04, forward_time=0.203, loss_att=39.115, acc=0.964, loss=39.115, backward_time=0.298, grad_norm=86.226, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.867e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:00:05,431 (trainer:732) INFO: 46epoch:train:10135-10697batch: iter_time=2.749e-04, forward_time=0.202, loss_att=40.392, acc=0.963, loss=40.392, backward_time=0.298, grad_norm=89.773, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.862e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:06:20,626 (trainer:732) INFO: 46epoch:train:10698-11260batch: iter_time=2.684e-04, forward_time=0.202, loss_att=39.716, acc=0.963, loss=39.716, backward_time=0.297, grad_norm=88.823, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.858e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:14:46,810 (trainer:338) INFO: 46epoch results: [train] iter_time=3.939e-04, forward_time=0.202, loss_att=39.809, acc=0.963, loss=39.809, backward_time=0.297, grad_norm=86.050, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.899e-04, train_time=2.727, time=2 hours, 8 minutes and 15.46 seconds, total_count=518512, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.787, acc=0.983, cer=0.022, wer=0.082, loss=10.787, time=4 minutes and 52.48 seconds, total_count=2576, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.86 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:14:50,581 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:14:50,596 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/35epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:14:50,596 (trainer:272) INFO: 47/60epoch started. Estimated time to finish: 1 day, 7 hours and 48 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:23:48,109 (trainer:732) INFO: 47epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=39.169, acc=0.964, loss=39.169, backward_time=0.298, grad_norm=84.066, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.854e-04, train_time=3.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:30:03,411 (trainer:732) INFO: 47epoch:train:564-1126batch: iter_time=2.737e-04, forward_time=0.202, loss_att=39.950, acc=0.964, loss=39.950, backward_time=0.297, grad_norm=83.668, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.849e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:36:17,773 (trainer:732) INFO: 47epoch:train:1127-1689batch: iter_time=2.738e-04, forward_time=0.201, loss_att=38.704, acc=0.964, loss=38.704, backward_time=0.296, grad_norm=87.839, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.845e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:42:33,895 (trainer:732) INFO: 47epoch:train:1690-2252batch: iter_time=2.682e-04, forward_time=0.203, loss_att=39.123, acc=0.964, loss=39.123, backward_time=0.297, grad_norm=81.315, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.841e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:48:49,444 (trainer:732) INFO: 47epoch:train:2253-2815batch: iter_time=2.732e-04, forward_time=0.202, loss_att=40.045, acc=0.964, loss=40.045, backward_time=0.297, grad_norm=85.193, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.837e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 01:55:04,842 (trainer:732) INFO: 47epoch:train:2816-3378batch: iter_time=2.751e-04, forward_time=0.202, loss_att=39.078, acc=0.964, loss=39.078, backward_time=0.297, grad_norm=82.780, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.833e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:01:21,303 (trainer:732) INFO: 47epoch:train:3379-3941batch: iter_time=2.799e-04, forward_time=0.203, loss_att=39.398, acc=0.964, loss=39.398, backward_time=0.298, grad_norm=96.600, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.828e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:07:38,292 (trainer:732) INFO: 47epoch:train:3942-4504batch: iter_time=2.840e-04, forward_time=0.203, loss_att=39.316, acc=0.964, loss=39.316, backward_time=0.298, grad_norm=94.392, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.824e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:13:54,255 (trainer:732) INFO: 47epoch:train:4505-5067batch: iter_time=2.714e-04, forward_time=0.203, loss_att=39.439, acc=0.964, loss=39.439, backward_time=0.298, grad_norm=82.584, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.820e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:20:10,407 (trainer:732) INFO: 47epoch:train:5068-5630batch: iter_time=2.753e-04, forward_time=0.202, loss_att=38.956, acc=0.964, loss=38.956, backward_time=0.297, grad_norm=86.918, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.816e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:26:25,667 (trainer:732) INFO: 47epoch:train:5631-6193batch: iter_time=2.682e-04, forward_time=0.201, loss_att=38.429, acc=0.964, loss=38.429, backward_time=0.296, grad_norm=86.377, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.811e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:32:41,979 (trainer:732) INFO: 47epoch:train:6194-6756batch: iter_time=2.681e-04, forward_time=0.203, loss_att=39.543, acc=0.963, loss=39.543, backward_time=0.298, grad_norm=85.686, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.807e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:38:58,548 (trainer:732) INFO: 47epoch:train:6757-7319batch: iter_time=2.653e-04, forward_time=0.203, loss_att=38.713, acc=0.964, loss=38.713, backward_time=0.298, grad_norm=83.818, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.803e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:45:14,353 (trainer:732) INFO: 47epoch:train:7320-7882batch: iter_time=2.685e-04, forward_time=0.202, loss_att=39.483, acc=0.964, loss=39.483, backward_time=0.297, grad_norm=80.948, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.799e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:51:29,483 (trainer:732) INFO: 47epoch:train:7883-8445batch: iter_time=2.707e-04, forward_time=0.202, loss_att=39.416, acc=0.964, loss=39.416, backward_time=0.296, grad_norm=85.868, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.795e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 02:57:45,380 (trainer:732) INFO: 47epoch:train:8446-9008batch: iter_time=2.705e-04, forward_time=0.202, loss_att=38.727, acc=0.964, loss=38.727, backward_time=0.297, grad_norm=83.162, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.791e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:04:00,089 (trainer:732) INFO: 47epoch:train:9009-9571batch: iter_time=2.704e-04, forward_time=0.202, loss_att=39.611, acc=0.963, loss=39.611, backward_time=0.297, grad_norm=85.401, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.786e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:10:15,250 (trainer:732) INFO: 47epoch:train:9572-10134batch: iter_time=2.720e-04, forward_time=0.202, loss_att=41.390, acc=0.962, loss=41.390, backward_time=0.296, grad_norm=88.495, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.782e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:16:29,218 (trainer:732) INFO: 47epoch:train:10135-10697batch: iter_time=2.613e-04, forward_time=0.202, loss_att=40.643, acc=0.963, loss=40.643, backward_time=0.296, grad_norm=83.975, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.778e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:22:45,686 (trainer:732) INFO: 47epoch:train:10698-11260batch: iter_time=2.625e-04, forward_time=0.202, loss_att=40.119, acc=0.963, loss=40.119, backward_time=0.297, grad_norm=84.116, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.774e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:31:01,970 (trainer:338) INFO: 47epoch results: [train] iter_time=3.819e-04, forward_time=0.202, loss_att=39.468, acc=0.964, loss=39.468, backward_time=0.297, grad_norm=85.667, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.814e-04, train_time=2.726, time=2 hours, 8 minutes and 12.76 seconds, total_count=529784, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.664, acc=0.983, cer=0.021, wer=0.082, loss=10.664, time=4 minutes and 44.82 seconds, total_count=2632, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.78 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:31:05,942 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:31:05,956 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/36epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:31:05,957 (trainer:272) INFO: 48/60epoch started. Estimated time to finish: 1 day, 5 hours and 32 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:40:00,926 (trainer:732) INFO: 48epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=38.116, acc=0.964, loss=38.116, backward_time=0.296, grad_norm=77.556, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.770e-04, train_time=3.808 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:46:17,670 (trainer:732) INFO: 48epoch:train:564-1126batch: iter_time=2.826e-04, forward_time=0.203, loss_att=38.712, acc=0.965, loss=38.712, backward_time=0.298, grad_norm=85.697, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.766e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:52:32,211 (trainer:732) INFO: 48epoch:train:1127-1689batch: iter_time=2.795e-04, forward_time=0.202, loss_att=37.844, acc=0.964, loss=37.844, backward_time=0.296, grad_norm=78.584, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.762e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 03:58:47,620 (trainer:732) INFO: 48epoch:train:1690-2252batch: iter_time=2.766e-04, forward_time=0.202, loss_att=38.212, acc=0.964, loss=38.212, backward_time=0.297, grad_norm=81.791, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.757e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:05:03,831 (trainer:732) INFO: 48epoch:train:2253-2815batch: iter_time=2.706e-04, forward_time=0.202, loss_att=38.559, acc=0.965, loss=38.559, backward_time=0.298, grad_norm=87.777, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.753e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:11:19,710 (trainer:732) INFO: 48epoch:train:2816-3378batch: iter_time=2.749e-04, forward_time=0.202, loss_att=38.642, acc=0.964, loss=38.642, backward_time=0.297, grad_norm=89.582, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.749e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:17:36,217 (trainer:732) INFO: 48epoch:train:3379-3941batch: iter_time=2.735e-04, forward_time=0.202, loss_att=37.968, acc=0.965, loss=37.968, backward_time=0.298, grad_norm=88.467, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.745e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:23:51,013 (trainer:732) INFO: 48epoch:train:3942-4504batch: iter_time=2.778e-04, forward_time=0.202, loss_att=39.092, acc=0.964, loss=39.092, backward_time=0.296, grad_norm=85.599, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.741e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:30:05,639 (trainer:732) INFO: 48epoch:train:4505-5067batch: iter_time=2.802e-04, forward_time=0.201, loss_att=38.937, acc=0.964, loss=38.937, backward_time=0.296, grad_norm=83.761, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.737e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:36:20,502 (trainer:732) INFO: 48epoch:train:5068-5630batch: iter_time=2.821e-04, forward_time=0.202, loss_att=39.046, acc=0.964, loss=39.046, backward_time=0.297, grad_norm=83.559, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.733e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:42:36,918 (trainer:732) INFO: 48epoch:train:5631-6193batch: iter_time=2.706e-04, forward_time=0.203, loss_att=40.643, acc=0.963, loss=40.643, backward_time=0.298, grad_norm=95.985, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.729e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:48:52,681 (trainer:732) INFO: 48epoch:train:6194-6756batch: iter_time=2.687e-04, forward_time=0.202, loss_att=39.719, acc=0.964, loss=39.719, backward_time=0.297, grad_norm=83.474, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.725e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 04:55:08,632 (trainer:732) INFO: 48epoch:train:6757-7319batch: iter_time=2.698e-04, forward_time=0.203, loss_att=37.755, acc=0.965, loss=37.755, backward_time=0.298, grad_norm=84.385, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.721e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:01:25,168 (trainer:732) INFO: 48epoch:train:7320-7882batch: iter_time=2.715e-04, forward_time=0.203, loss_att=41.039, acc=0.963, loss=41.039, backward_time=0.298, grad_norm=88.917, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.717e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:07:40,255 (trainer:732) INFO: 48epoch:train:7883-8445batch: iter_time=2.771e-04, forward_time=0.202, loss_att=39.242, acc=0.964, loss=39.242, backward_time=0.296, grad_norm=80.113, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.713e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:13:57,173 (trainer:732) INFO: 48epoch:train:8446-9008batch: iter_time=2.712e-04, forward_time=0.203, loss_att=38.936, acc=0.964, loss=38.936, backward_time=0.298, grad_norm=85.047, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.709e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:20:13,405 (trainer:732) INFO: 48epoch:train:9009-9571batch: iter_time=2.694e-04, forward_time=0.202, loss_att=39.445, acc=0.964, loss=39.445, backward_time=0.298, grad_norm=84.891, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.705e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:26:26,746 (trainer:732) INFO: 48epoch:train:9572-10134batch: iter_time=2.680e-04, forward_time=0.201, loss_att=39.057, acc=0.963, loss=39.057, backward_time=0.295, grad_norm=82.695, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.701e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:32:43,446 (trainer:732) INFO: 48epoch:train:10135-10697batch: iter_time=2.677e-04, forward_time=0.203, loss_att=40.252, acc=0.964, loss=40.252, backward_time=0.298, grad_norm=86.714, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.697e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:39:00,953 (trainer:732) INFO: 48epoch:train:10698-11260batch: iter_time=2.632e-04, forward_time=0.203, loss_att=40.182, acc=0.964, loss=40.182, backward_time=0.299, grad_norm=87.058, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.693e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:47:19,982 (trainer:338) INFO: 48epoch results: [train] iter_time=3.856e-04, forward_time=0.202, loss_att=39.054, acc=0.964, loss=39.054, backward_time=0.297, grad_norm=85.052, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.731e-04, train_time=2.726, time=2 hours, 8 minutes and 12.6 seconds, total_count=541056, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.390, acc=0.983, cer=0.021, wer=0.080, loss=10.390, time=4 minutes and 47.34 seconds, total_count=2688, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 14.08 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:47:23,826 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:47:23,841 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/39epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:47:23,841 (trainer:272) INFO: 49/60epoch started. Estimated time to finish: 1 day, 3 hours and 15 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 05:56:24,830 (trainer:732) INFO: 49epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=38.456, acc=0.965, loss=38.456, backward_time=0.299, grad_norm=85.508, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.688e-04, train_time=3.851 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:02:39,897 (trainer:732) INFO: 49epoch:train:564-1126batch: iter_time=2.501e-04, forward_time=0.202, loss_att=37.964, acc=0.965, loss=37.964, backward_time=0.296, grad_norm=84.353, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.684e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:08:56,659 (trainer:732) INFO: 49epoch:train:1127-1689batch: iter_time=2.552e-04, forward_time=0.203, loss_att=38.353, acc=0.965, loss=38.353, backward_time=0.298, grad_norm=87.184, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.680e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:15:12,333 (trainer:732) INFO: 49epoch:train:1690-2252batch: iter_time=2.450e-04, forward_time=0.202, loss_att=38.996, acc=0.965, loss=38.996, backward_time=0.297, grad_norm=78.760, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.676e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:21:29,161 (trainer:732) INFO: 49epoch:train:2253-2815batch: iter_time=2.394e-04, forward_time=0.203, loss_att=39.361, acc=0.964, loss=39.361, backward_time=0.299, grad_norm=97.233, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.673e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:27:45,873 (trainer:732) INFO: 49epoch:train:2816-3378batch: iter_time=2.542e-04, forward_time=0.202, loss_att=37.384, acc=0.965, loss=37.384, backward_time=0.298, grad_norm=84.163, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.669e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:34:01,450 (trainer:732) INFO: 49epoch:train:3379-3941batch: iter_time=2.604e-04, forward_time=0.202, loss_att=38.294, acc=0.965, loss=38.294, backward_time=0.297, grad_norm=78.816, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.665e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:40:19,304 (trainer:732) INFO: 49epoch:train:3942-4504batch: iter_time=2.569e-04, forward_time=0.203, loss_att=39.191, acc=0.964, loss=39.191, backward_time=0.299, grad_norm=84.374, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.661e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:46:33,501 (trainer:732) INFO: 49epoch:train:4505-5067batch: iter_time=2.452e-04, forward_time=0.202, loss_att=39.193, acc=0.964, loss=39.193, backward_time=0.296, grad_norm=76.027, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.657e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:52:48,445 (trainer:732) INFO: 49epoch:train:5068-5630batch: iter_time=2.588e-04, forward_time=0.202, loss_att=38.199, acc=0.965, loss=38.199, backward_time=0.297, grad_norm=82.266, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.653e-04, train_time=2.662 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<62967> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<63019> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<32298> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<32442> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 06:59:05,062 (trainer:732) INFO: 49epoch:train:5631-6193batch: iter_time=2.417e-04, forward_time=0.203, loss_att=38.469, acc=0.965, loss=38.469, backward_time=0.298, grad_norm=85.368, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.649e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<49660> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<49768> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<18470> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<18580> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:05:19,294 (trainer:732) INFO: 49epoch:train:6194-6756batch: iter_time=2.464e-04, forward_time=0.201, loss_att=38.638, acc=0.964, loss=38.638, backward_time=0.296, grad_norm=91.053, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.645e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:11:35,249 (trainer:732) INFO: 49epoch:train:6757-7319batch: iter_time=2.490e-04, forward_time=0.203, loss_att=39.799, acc=0.964, loss=39.799, backward_time=0.298, grad_norm=87.552, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.641e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:17:49,372 (trainer:732) INFO: 49epoch:train:7320-7882batch: iter_time=2.444e-04, forward_time=0.201, loss_att=38.948, acc=0.964, loss=38.948, backward_time=0.296, grad_norm=85.487, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.637e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:24:03,070 (trainer:732) INFO: 49epoch:train:7883-8445batch: iter_time=2.478e-04, forward_time=0.201, loss_att=39.309, acc=0.964, loss=39.309, backward_time=0.295, grad_norm=84.655, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.633e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:30:18,805 (trainer:732) INFO: 49epoch:train:8446-9008batch: iter_time=2.463e-04, forward_time=0.202, loss_att=39.104, acc=0.965, loss=39.104, backward_time=0.298, grad_norm=87.797, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.629e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:36:33,306 (trainer:732) INFO: 49epoch:train:9009-9571batch: iter_time=2.561e-04, forward_time=0.202, loss_att=38.479, acc=0.964, loss=38.479, backward_time=0.296, grad_norm=89.972, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.625e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:42:47,197 (trainer:732) INFO: 49epoch:train:9572-10134batch: iter_time=2.462e-04, forward_time=0.201, loss_att=37.475, acc=0.964, loss=37.475, backward_time=0.296, grad_norm=91.151, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.621e-04, train_time=2.656 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:49:03,012 (trainer:732) INFO: 49epoch:train:10135-10697batch: iter_time=2.438e-04, forward_time=0.202, loss_att=38.438, acc=0.964, loss=38.438, backward_time=0.297, grad_norm=84.886, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.617e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 07:55:18,107 (trainer:732) INFO: 49epoch:train:10698-11260batch: iter_time=2.405e-04, forward_time=0.202, loss_att=39.420, acc=0.964, loss=39.420, backward_time=0.297, grad_norm=90.248, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.614e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:03:25,838 (trainer:338) INFO: 49epoch results: [train] iter_time=3.541e-04, forward_time=0.202, loss_att=38.683, acc=0.964, loss=38.683, backward_time=0.297, grad_norm=85.827, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.651e-04, train_time=2.726, time=2 hours, 8 minutes and 10.83 seconds, total_count=552328, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.418, acc=0.983, cer=0.021, wer=0.081, loss=10.418, time=4 minutes and 37.48 seconds, total_count=2744, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 13.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:03:29,599 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:03:29,613 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/41epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:03:29,613 (trainer:272) INFO: 50/60epoch started. Estimated time to finish: 1 day, 59 minutes and 29.27 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:12:24,591 (trainer:732) INFO: 50epoch:train:1-563batch: iter_time=0.003, forward_time=0.201, loss_att=36.722, acc=0.965, loss=36.722, backward_time=0.295, grad_norm=81.407, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.610e-04, train_time=3.808 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:18:39,321 (trainer:732) INFO: 50epoch:train:564-1126batch: iter_time=2.801e-04, forward_time=0.202, loss_att=37.891, acc=0.965, loss=37.891, backward_time=0.296, grad_norm=82.102, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.606e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:24:55,257 (trainer:732) INFO: 50epoch:train:1127-1689batch: iter_time=2.772e-04, forward_time=0.203, loss_att=38.297, acc=0.965, loss=38.297, backward_time=0.298, grad_norm=88.916, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.602e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:31:11,226 (trainer:732) INFO: 50epoch:train:1690-2252batch: iter_time=2.711e-04, forward_time=0.203, loss_att=37.864, acc=0.965, loss=37.864, backward_time=0.298, grad_norm=94.335, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.598e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:37:28,532 (trainer:732) INFO: 50epoch:train:2253-2815batch: iter_time=2.757e-04, forward_time=0.203, loss_att=38.840, acc=0.965, loss=38.840, backward_time=0.299, grad_norm=92.379, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.594e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:43:43,496 (trainer:732) INFO: 50epoch:train:2816-3378batch: iter_time=2.671e-04, forward_time=0.202, loss_att=38.532, acc=0.964, loss=38.532, backward_time=0.297, grad_norm=86.904, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.590e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:49:58,120 (trainer:732) INFO: 50epoch:train:3379-3941batch: iter_time=2.692e-04, forward_time=0.201, loss_att=38.546, acc=0.965, loss=38.546, backward_time=0.296, grad_norm=81.540, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.586e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 08:56:15,748 (trainer:732) INFO: 50epoch:train:3942-4504batch: iter_time=2.638e-04, forward_time=0.203, loss_att=38.007, acc=0.965, loss=38.007, backward_time=0.299, grad_norm=84.357, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.583e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:02:31,831 (trainer:732) INFO: 50epoch:train:4505-5067batch: iter_time=2.641e-04, forward_time=0.202, loss_att=38.294, acc=0.965, loss=38.294, backward_time=0.298, grad_norm=78.758, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.579e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:08:47,304 (trainer:732) INFO: 50epoch:train:5068-5630batch: iter_time=2.717e-04, forward_time=0.202, loss_att=39.013, acc=0.964, loss=39.013, backward_time=0.297, grad_norm=80.938, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.575e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:15:02,711 (trainer:732) INFO: 50epoch:train:5631-6193batch: iter_time=2.797e-04, forward_time=0.202, loss_att=39.188, acc=0.964, loss=39.188, backward_time=0.297, grad_norm=85.789, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.571e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:21:16,004 (trainer:732) INFO: 50epoch:train:6194-6756batch: iter_time=2.718e-04, forward_time=0.201, loss_att=37.882, acc=0.964, loss=37.882, backward_time=0.295, grad_norm=83.569, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.567e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:27:31,035 (trainer:732) INFO: 50epoch:train:6757-7319batch: iter_time=2.807e-04, forward_time=0.202, loss_att=38.404, acc=0.964, loss=38.404, backward_time=0.297, grad_norm=83.353, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.564e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:33:46,855 (trainer:732) INFO: 50epoch:train:7320-7882batch: iter_time=2.778e-04, forward_time=0.202, loss_att=38.557, acc=0.964, loss=38.557, backward_time=0.297, grad_norm=83.432, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.560e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:40:03,061 (trainer:732) INFO: 50epoch:train:7883-8445batch: iter_time=2.722e-04, forward_time=0.203, loss_att=37.746, acc=0.965, loss=37.746, backward_time=0.298, grad_norm=83.788, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.556e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:46:22,347 (trainer:732) INFO: 50epoch:train:8446-9008batch: iter_time=2.828e-04, forward_time=0.204, loss_att=39.801, acc=0.964, loss=39.801, backward_time=0.300, grad_norm=86.729, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.552e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:52:36,764 (trainer:732) INFO: 50epoch:train:9009-9571batch: iter_time=2.804e-04, forward_time=0.202, loss_att=38.322, acc=0.964, loss=38.322, backward_time=0.296, grad_norm=80.041, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.548e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 09:58:51,066 (trainer:732) INFO: 50epoch:train:9572-10134batch: iter_time=2.761e-04, forward_time=0.201, loss_att=37.919, acc=0.965, loss=37.919, backward_time=0.296, grad_norm=81.220, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.545e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:05:07,971 (trainer:732) INFO: 50epoch:train:10135-10697batch: iter_time=2.775e-04, forward_time=0.203, loss_att=39.632, acc=0.964, loss=39.632, backward_time=0.298, grad_norm=90.342, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.541e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:11:23,100 (trainer:732) INFO: 50epoch:train:10698-11260batch: iter_time=2.679e-04, forward_time=0.202, loss_att=38.357, acc=0.964, loss=38.357, backward_time=0.297, grad_norm=92.138, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.537e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:19:27,464 (trainer:338) INFO: 50epoch results: [train] iter_time=4.000e-04, forward_time=0.202, loss_att=38.384, acc=0.965, loss=38.384, backward_time=0.297, grad_norm=85.122, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.573e-04, train_time=2.725, time=2 hours, 8 minutes and 9.02 seconds, total_count=563600, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.591, acc=0.983, cer=0.021, wer=0.081, loss=10.591, time=4 minutes and 33.56 seconds, total_count=2800, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.26 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:19:31,277 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:19:31,293 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/37epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:19:31,293 (trainer:272) INFO: 51/60epoch started. Estimated time to finish: 22 hours, 43 minutes and 6.78 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:28:26,789 (trainer:732) INFO: 51epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=37.517, acc=0.965, loss=37.517, backward_time=0.297, grad_norm=79.938, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.533e-04, train_time=3.811 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:34:41,542 (trainer:732) INFO: 51epoch:train:564-1126batch: iter_time=2.698e-04, forward_time=0.202, loss_att=36.580, acc=0.966, loss=36.580, backward_time=0.296, grad_norm=87.098, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.529e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:40:56,402 (trainer:732) INFO: 51epoch:train:1127-1689batch: iter_time=2.698e-04, forward_time=0.202, loss_att=37.277, acc=0.965, loss=37.277, backward_time=0.297, grad_norm=82.978, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.526e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:47:11,384 (trainer:732) INFO: 51epoch:train:1690-2252batch: iter_time=2.681e-04, forward_time=0.202, loss_att=36.468, acc=0.966, loss=36.468, backward_time=0.296, grad_norm=80.659, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.522e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:53:24,840 (trainer:732) INFO: 51epoch:train:2253-2815batch: iter_time=2.676e-04, forward_time=0.202, loss_att=38.128, acc=0.964, loss=38.128, backward_time=0.296, grad_norm=84.377, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.518e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 10:59:39,506 (trainer:732) INFO: 51epoch:train:2816-3378batch: iter_time=2.652e-04, forward_time=0.202, loss_att=37.631, acc=0.965, loss=37.631, backward_time=0.296, grad_norm=82.193, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.514e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:05:53,254 (trainer:732) INFO: 51epoch:train:3379-3941batch: iter_time=2.767e-04, forward_time=0.202, loss_att=38.197, acc=0.964, loss=38.197, backward_time=0.296, grad_norm=80.562, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.511e-04, train_time=2.655 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:12:10,204 (trainer:732) INFO: 51epoch:train:3942-4504batch: iter_time=2.764e-04, forward_time=0.203, loss_att=39.081, acc=0.965, loss=39.081, backward_time=0.298, grad_norm=85.016, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.507e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:18:25,976 (trainer:732) INFO: 51epoch:train:4505-5067batch: iter_time=2.672e-04, forward_time=0.203, loss_att=37.458, acc=0.966, loss=37.458, backward_time=0.297, grad_norm=82.418, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.503e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:24:42,702 (trainer:732) INFO: 51epoch:train:5068-5630batch: iter_time=2.739e-04, forward_time=0.203, loss_att=38.296, acc=0.965, loss=38.296, backward_time=0.298, grad_norm=92.060, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.500e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:30:58,967 (trainer:732) INFO: 51epoch:train:5631-6193batch: iter_time=2.788e-04, forward_time=0.203, loss_att=38.215, acc=0.965, loss=38.215, backward_time=0.298, grad_norm=84.944, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.496e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:37:14,278 (trainer:732) INFO: 51epoch:train:6194-6756batch: iter_time=2.702e-04, forward_time=0.202, loss_att=39.237, acc=0.964, loss=39.237, backward_time=0.297, grad_norm=87.778, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.492e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:43:29,236 (trainer:732) INFO: 51epoch:train:6757-7319batch: iter_time=2.658e-04, forward_time=0.202, loss_att=38.348, acc=0.964, loss=38.348, backward_time=0.296, grad_norm=81.006, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.488e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:49:45,144 (trainer:732) INFO: 51epoch:train:7320-7882batch: iter_time=2.641e-04, forward_time=0.202, loss_att=38.178, acc=0.965, loss=38.178, backward_time=0.297, grad_norm=83.594, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.485e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 11:56:00,393 (trainer:732) INFO: 51epoch:train:7883-8445batch: iter_time=2.741e-04, forward_time=0.202, loss_att=37.618, acc=0.965, loss=37.618, backward_time=0.297, grad_norm=82.194, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.481e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:02:15,999 (trainer:732) INFO: 51epoch:train:8446-9008batch: iter_time=2.725e-04, forward_time=0.202, loss_att=39.118, acc=0.964, loss=39.118, backward_time=0.297, grad_norm=85.885, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.477e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:08:31,006 (trainer:732) INFO: 51epoch:train:9009-9571batch: iter_time=2.643e-04, forward_time=0.202, loss_att=38.672, acc=0.964, loss=38.672, backward_time=0.296, grad_norm=113.017, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.474e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:14:49,447 (trainer:732) INFO: 51epoch:train:9572-10134batch: iter_time=2.684e-04, forward_time=0.204, loss_att=38.953, acc=0.965, loss=38.953, backward_time=0.300, grad_norm=90.421, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.470e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:21:07,479 (trainer:732) INFO: 51epoch:train:10135-10697batch: iter_time=2.652e-04, forward_time=0.203, loss_att=38.583, acc=0.965, loss=38.583, backward_time=0.299, grad_norm=86.366, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.466e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:27:23,480 (trainer:732) INFO: 51epoch:train:10698-11260batch: iter_time=2.639e-04, forward_time=0.202, loss_att=37.892, acc=0.965, loss=37.892, backward_time=0.298, grad_norm=88.692, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.463e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:35:29,457 (trainer:338) INFO: 51epoch results: [train] iter_time=3.472e-04, forward_time=0.202, loss_att=38.071, acc=0.965, loss=38.071, backward_time=0.297, grad_norm=86.053, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.498e-04, train_time=2.725, time=2 hours, 8 minutes and 9.47 seconds, total_count=574872, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.717, acc=0.983, cer=0.021, wer=0.081, loss=10.717, time=4 minutes and 32.35 seconds, total_count=2856, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 16.34 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:35:33,363 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:35:33,382 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/43epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:35:33,383 (trainer:272) INFO: 52/60epoch started. Estimated time to finish: 20 hours, 26 minutes and 45.17 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:44:30,548 (trainer:732) INFO: 52epoch:train:1-563batch: iter_time=0.003, forward_time=0.202, loss_att=37.457, acc=0.965, loss=37.457, backward_time=0.297, grad_norm=82.278, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.459e-04, train_time=3.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:50:44,902 (trainer:732) INFO: 52epoch:train:564-1126batch: iter_time=2.573e-04, forward_time=0.202, loss_att=36.810, acc=0.966, loss=36.810, backward_time=0.296, grad_norm=81.180, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.455e-04, train_time=2.661 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 12:56:59,802 (trainer:732) INFO: 52epoch:train:1127-1689batch: iter_time=2.597e-04, forward_time=0.202, loss_att=37.262, acc=0.966, loss=37.262, backward_time=0.296, grad_norm=86.778, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.452e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:03:16,162 (trainer:732) INFO: 52epoch:train:1690-2252batch: iter_time=2.623e-04, forward_time=0.203, loss_att=38.083, acc=0.965, loss=38.083, backward_time=0.298, grad_norm=86.163, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.448e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:09:31,187 (trainer:732) INFO: 52epoch:train:2253-2815batch: iter_time=2.581e-04, forward_time=0.202, loss_att=37.205, acc=0.965, loss=37.205, backward_time=0.297, grad_norm=90.829, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.444e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:15:46,583 (trainer:732) INFO: 52epoch:train:2816-3378batch: iter_time=2.643e-04, forward_time=0.202, loss_att=37.334, acc=0.965, loss=37.334, backward_time=0.297, grad_norm=85.066, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.441e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:22:03,314 (trainer:732) INFO: 52epoch:train:3379-3941batch: iter_time=2.558e-04, forward_time=0.203, loss_att=38.072, acc=0.965, loss=38.072, backward_time=0.298, grad_norm=84.232, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.437e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:28:19,884 (trainer:732) INFO: 52epoch:train:3942-4504batch: iter_time=2.588e-04, forward_time=0.203, loss_att=37.574, acc=0.965, loss=37.574, backward_time=0.298, grad_norm=83.437, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.434e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:34:35,187 (trainer:732) INFO: 52epoch:train:4505-5067batch: iter_time=2.628e-04, forward_time=0.202, loss_att=38.448, acc=0.965, loss=38.448, backward_time=0.297, grad_norm=88.803, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.430e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:40:52,169 (trainer:732) INFO: 52epoch:train:5068-5630batch: iter_time=2.650e-04, forward_time=0.203, loss_att=38.791, acc=0.965, loss=38.791, backward_time=0.298, grad_norm=90.565, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.426e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:47:08,831 (trainer:732) INFO: 52epoch:train:5631-6193batch: iter_time=2.561e-04, forward_time=0.203, loss_att=38.364, acc=0.965, loss=38.364, backward_time=0.298, grad_norm=92.793, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.423e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:53:24,825 (trainer:732) INFO: 52epoch:train:6194-6756batch: iter_time=2.566e-04, forward_time=0.202, loss_att=37.279, acc=0.965, loss=37.279, backward_time=0.297, grad_norm=86.697, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.419e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 13:59:42,734 (trainer:732) INFO: 52epoch:train:6757-7319batch: iter_time=2.753e-04, forward_time=0.203, loss_att=38.790, acc=0.965, loss=38.790, backward_time=0.299, grad_norm=89.825, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.416e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:05:58,770 (trainer:732) INFO: 52epoch:train:7320-7882batch: iter_time=2.534e-04, forward_time=0.202, loss_att=38.242, acc=0.965, loss=38.242, backward_time=0.298, grad_norm=83.758, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.412e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:12:13,924 (trainer:732) INFO: 52epoch:train:7883-8445batch: iter_time=2.576e-04, forward_time=0.202, loss_att=36.818, acc=0.966, loss=36.818, backward_time=0.297, grad_norm=83.583, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.408e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:18:27,477 (trainer:732) INFO: 52epoch:train:8446-9008batch: iter_time=2.538e-04, forward_time=0.201, loss_att=37.343, acc=0.965, loss=37.343, backward_time=0.295, grad_norm=81.712, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.405e-04, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:24:43,177 (trainer:732) INFO: 52epoch:train:9009-9571batch: iter_time=2.540e-04, forward_time=0.202, loss_att=37.749, acc=0.965, loss=37.749, backward_time=0.297, grad_norm=82.110, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.401e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:30:55,997 (trainer:732) INFO: 52epoch:train:9572-10134batch: iter_time=2.617e-04, forward_time=0.201, loss_att=37.779, acc=0.964, loss=37.779, backward_time=0.295, grad_norm=82.488, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.398e-04, train_time=2.649 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:37:12,562 (trainer:732) INFO: 52epoch:train:10135-10697batch: iter_time=2.599e-04, forward_time=0.203, loss_att=38.474, acc=0.965, loss=38.474, backward_time=0.298, grad_norm=87.966, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.394e-04, train_time=2.675 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:43:28,073 (trainer:732) INFO: 52epoch:train:10698-11260batch: iter_time=2.576e-04, forward_time=0.202, loss_att=37.069, acc=0.966, loss=37.069, backward_time=0.297, grad_norm=82.609, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.391e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:51:34,785 (trainer:338) INFO: 52epoch results: [train] iter_time=3.846e-04, forward_time=0.202, loss_att=37.734, acc=0.965, loss=37.734, backward_time=0.297, grad_norm=85.614, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.425e-04, train_time=2.726, time=2 hours, 8 minutes and 14.22 seconds, total_count=586144, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.236, acc=0.983, cer=0.021, wer=0.079, loss=10.236, time=4 minutes and 31.57 seconds, total_count=2912, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.61 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:51:38,566 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:51:38,581 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/40epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 14:51:38,581 (trainer:272) INFO: 53/60epoch started. Estimated time to finish: 18 hours, 10 minutes and 24.8 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:00:36,776 (trainer:732) INFO: 53epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=37.407, acc=0.966, loss=37.407, backward_time=0.298, grad_norm=86.248, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.387e-04, train_time=3.832 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:06:52,299 (trainer:732) INFO: 53epoch:train:564-1126batch: iter_time=2.850e-04, forward_time=0.202, loss_att=37.471, acc=0.966, loss=37.471, backward_time=0.297, grad_norm=87.146, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.383e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:13:08,039 (trainer:732) INFO: 53epoch:train:1127-1689batch: iter_time=2.718e-04, forward_time=0.202, loss_att=36.746, acc=0.966, loss=36.746, backward_time=0.297, grad_norm=86.139, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.380e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:19:23,884 (trainer:732) INFO: 53epoch:train:1690-2252batch: iter_time=2.710e-04, forward_time=0.203, loss_att=37.289, acc=0.966, loss=37.289, backward_time=0.297, grad_norm=86.974, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.376e-04, train_time=2.669 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:25:38,662 (trainer:732) INFO: 53epoch:train:2253-2815batch: iter_time=2.747e-04, forward_time=0.202, loss_att=36.374, acc=0.966, loss=36.374, backward_time=0.297, grad_norm=87.264, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.373e-04, train_time=2.665 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:31:54,739 (trainer:732) INFO: 53epoch:train:2816-3378batch: iter_time=2.819e-04, forward_time=0.202, loss_att=37.205, acc=0.966, loss=37.205, backward_time=0.297, grad_norm=80.259, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.369e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:38:11,261 (trainer:732) INFO: 53epoch:train:3379-3941batch: iter_time=2.670e-04, forward_time=0.203, loss_att=36.240, acc=0.966, loss=36.240, backward_time=0.298, grad_norm=84.894, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.366e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:44:26,303 (trainer:732) INFO: 53epoch:train:3942-4504batch: iter_time=2.773e-04, forward_time=0.202, loss_att=37.528, acc=0.966, loss=37.528, backward_time=0.296, grad_norm=83.355, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.362e-04, train_time=2.664 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:50:40,384 (trainer:732) INFO: 53epoch:train:4505-5067batch: iter_time=2.687e-04, forward_time=0.201, loss_att=37.124, acc=0.965, loss=37.124, backward_time=0.296, grad_norm=83.451, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.359e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 15:56:56,901 (trainer:732) INFO: 53epoch:train:5068-5630batch: iter_time=2.765e-04, forward_time=0.203, loss_att=37.256, acc=0.966, loss=37.256, backward_time=0.298, grad_norm=84.230, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.355e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:03:13,379 (trainer:732) INFO: 53epoch:train:5631-6193batch: iter_time=2.707e-04, forward_time=0.202, loss_att=37.381, acc=0.966, loss=37.381, backward_time=0.298, grad_norm=86.508, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.352e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:09:28,221 (trainer:732) INFO: 53epoch:train:6194-6756batch: iter_time=2.635e-04, forward_time=0.202, loss_att=37.462, acc=0.965, loss=37.462, backward_time=0.297, grad_norm=92.704, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.348e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:15:43,525 (trainer:732) INFO: 53epoch:train:6757-7319batch: iter_time=2.700e-04, forward_time=0.202, loss_att=36.547, acc=0.966, loss=36.547, backward_time=0.297, grad_norm=82.711, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.345e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:21:59,843 (trainer:732) INFO: 53epoch:train:7320-7882batch: iter_time=2.735e-04, forward_time=0.203, loss_att=37.513, acc=0.966, loss=37.513, backward_time=0.298, grad_norm=85.250, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.341e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:28:15,763 (trainer:732) INFO: 53epoch:train:7883-8445batch: iter_time=2.646e-04, forward_time=0.202, loss_att=39.697, acc=0.964, loss=39.697, backward_time=0.297, grad_norm=85.716, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.338e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:34:33,835 (trainer:732) INFO: 53epoch:train:8446-9008batch: iter_time=2.707e-04, forward_time=0.204, loss_att=38.841, acc=0.965, loss=38.841, backward_time=0.300, grad_norm=83.213, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.334e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:40:50,068 (trainer:732) INFO: 53epoch:train:9009-9571batch: iter_time=2.682e-04, forward_time=0.202, loss_att=37.387, acc=0.966, loss=37.387, backward_time=0.298, grad_norm=86.042, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.331e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:47:05,254 (trainer:732) INFO: 53epoch:train:9572-10134batch: iter_time=2.668e-04, forward_time=0.202, loss_att=37.861, acc=0.965, loss=37.861, backward_time=0.297, grad_norm=89.465, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.327e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:53:18,343 (trainer:732) INFO: 53epoch:train:10135-10697batch: iter_time=2.696e-04, forward_time=0.201, loss_att=37.677, acc=0.965, loss=37.677, backward_time=0.295, grad_norm=93.516, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.324e-04, train_time=2.651 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 16:59:32,012 (trainer:732) INFO: 53epoch:train:10698-11260batch: iter_time=2.613e-04, forward_time=0.201, loss_att=37.094, acc=0.965, loss=37.094, backward_time=0.295, grad_norm=81.022, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.321e-04, train_time=2.653 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:07:35,316 (trainer:338) INFO: 53epoch results: [train] iter_time=3.757e-04, forward_time=0.202, loss_att=37.404, acc=0.965, loss=37.404, backward_time=0.297, grad_norm=85.790, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.354e-04, train_time=2.725, time=2 hours, 8 minutes and 9.61 seconds, total_count=597416, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.438, acc=0.983, cer=0.021, wer=0.080, loss=10.438, time=4 minutes and 31.35 seconds, total_count=2968, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 15.76 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:07:39,403 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:07:39,417 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/42epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:07:39,418 (trainer:272) INFO: 54/60epoch started. Estimated time to finish: 15 hours, 54 minutes and 4.42 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:16:37,308 (trainer:732) INFO: 54epoch:train:1-563batch: iter_time=0.002, forward_time=0.202, loss_att=36.277, acc=0.966, loss=36.277, backward_time=0.296, grad_norm=81.906, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.317e-04, train_time=3.828 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:22:54,208 (trainer:732) INFO: 54epoch:train:564-1126batch: iter_time=2.645e-04, forward_time=0.203, loss_att=37.068, acc=0.966, loss=37.068, backward_time=0.298, grad_norm=85.688, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.314e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:29:09,589 (trainer:732) INFO: 54epoch:train:1127-1689batch: iter_time=2.682e-04, forward_time=0.203, loss_att=36.409, acc=0.966, loss=36.409, backward_time=0.298, grad_norm=83.211, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.310e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:35:25,750 (trainer:732) INFO: 54epoch:train:1690-2252batch: iter_time=2.615e-04, forward_time=0.202, loss_att=37.848, acc=0.965, loss=37.848, backward_time=0.297, grad_norm=82.251, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.307e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:41:39,229 (trainer:732) INFO: 54epoch:train:2253-2815batch: iter_time=2.613e-04, forward_time=0.201, loss_att=35.800, acc=0.966, loss=35.800, backward_time=0.295, grad_norm=92.092, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.303e-04, train_time=2.655 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<16796> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:47:55,199 (trainer:732) INFO: 54epoch:train:2816-3378batch: iter_time=2.617e-04, forward_time=0.202, loss_att=35.487, acc=0.966, loss=35.487, backward_time=0.297, grad_norm=85.719, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.300e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 17:54:13,868 (trainer:732) INFO: 54epoch:train:3379-3941batch: iter_time=2.626e-04, forward_time=0.204, loss_att=37.390, acc=0.966, loss=37.390, backward_time=0.300, grad_norm=83.319, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.296e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:00:29,887 (trainer:732) INFO: 54epoch:train:3942-4504batch: iter_time=2.613e-04, forward_time=0.202, loss_att=37.981, acc=0.965, loss=37.981, backward_time=0.297, grad_norm=92.695, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.293e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:06:43,951 (trainer:732) INFO: 54epoch:train:4505-5067batch: iter_time=2.597e-04, forward_time=0.201, loss_att=36.863, acc=0.966, loss=36.863, backward_time=0.296, grad_norm=82.204, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.290e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:13:00,046 (trainer:732) INFO: 54epoch:train:5068-5630batch: iter_time=2.593e-04, forward_time=0.202, loss_att=37.693, acc=0.965, loss=37.693, backward_time=0.298, grad_norm=87.161, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.286e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:19:16,604 (trainer:732) INFO: 54epoch:train:5631-6193batch: iter_time=2.528e-04, forward_time=0.203, loss_att=37.920, acc=0.965, loss=37.920, backward_time=0.298, grad_norm=84.423, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.283e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:25:32,268 (trainer:732) INFO: 54epoch:train:6194-6756batch: iter_time=2.636e-04, forward_time=0.202, loss_att=37.524, acc=0.965, loss=37.524, backward_time=0.297, grad_norm=85.631, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.279e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:31:47,487 (trainer:732) INFO: 54epoch:train:6757-7319batch: iter_time=2.628e-04, forward_time=0.202, loss_att=36.631, acc=0.966, loss=36.631, backward_time=0.297, grad_norm=88.218, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.276e-04, train_time=2.667 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:38:02,220 (trainer:732) INFO: 54epoch:train:7320-7882batch: iter_time=2.632e-04, forward_time=0.202, loss_att=36.253, acc=0.966, loss=36.253, backward_time=0.297, grad_norm=83.755, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.273e-04, train_time=2.662 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:44:17,557 (trainer:732) INFO: 54epoch:train:7883-8445batch: iter_time=2.593e-04, forward_time=0.202, loss_att=37.768, acc=0.965, loss=37.768, backward_time=0.297, grad_norm=81.082, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.269e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:50:31,749 (trainer:732) INFO: 54epoch:train:8446-9008batch: iter_time=2.607e-04, forward_time=0.202, loss_att=37.728, acc=0.965, loss=37.728, backward_time=0.296, grad_norm=86.481, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.266e-04, train_time=2.658 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 18:56:48,095 (trainer:732) INFO: 54epoch:train:9009-9571batch: iter_time=2.561e-04, forward_time=0.202, loss_att=37.250, acc=0.966, loss=37.250, backward_time=0.298, grad_norm=84.537, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.262e-04, train_time=2.675 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849198:1849255 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:03:04,568 (trainer:732) INFO: 54epoch:train:9572-10134batch: iter_time=2.560e-04, forward_time=0.203, loss_att=37.731, acc=0.966, loss=37.731, backward_time=0.298, grad_norm=84.474, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.259e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:09:20,004 (trainer:732) INFO: 54epoch:train:10135-10697batch: iter_time=2.523e-04, forward_time=0.202, loss_att=37.602, acc=0.966, loss=37.602, backward_time=0.297, grad_norm=80.700, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.256e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:15:34,744 (trainer:732) INFO: 54epoch:train:10698-11260batch: iter_time=2.505e-04, forward_time=0.201, loss_att=37.355, acc=0.966, loss=37.355, backward_time=0.296, grad_norm=83.035, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.252e-04, train_time=2.661 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 92) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849195:1849254 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 92) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:23:40,676 (trainer:338) INFO: 54epoch results: [train] iter_time=3.487e-04, forward_time=0.202, loss_att=37.125, acc=0.966, loss=37.125, backward_time=0.297, grad_norm=84.905, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.285e-04, train_time=2.726, time=2 hours, 8 minutes and 14.15 seconds, total_count=608688, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.476, acc=0.984, cer=0.021, wer=0.080, loss=10.476, time=4 minutes and 30.55 seconds, total_count=3024, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 16.55 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:23:44,653 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:23:44,674 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/46epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:23:44,674 (trainer:272) INFO: 55/60epoch started. Estimated time to finish: 13 hours, 37 minutes and 45.25 seconds + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:32:42,969 (trainer:732) INFO: 55epoch:train:1-563batch: iter_time=0.002, forward_time=0.203, loss_att=36.333, acc=0.967, loss=36.333, backward_time=0.299, grad_norm=81.343, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.249e-04, train_time=3.830 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:38:59,708 (trainer:732) INFO: 55epoch:train:564-1126batch: iter_time=2.718e-04, forward_time=0.203, loss_att=36.652, acc=0.966, loss=36.652, backward_time=0.299, grad_norm=95.710, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.246e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.245<35797> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849196:1849253 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:45:14,435 (trainer:732) INFO: 55epoch:train:1127-1689batch: iter_time=2.757e-04, forward_time=0.202, loss_att=35.673, acc=0.967, loss=35.673, backward_time=0.296, grad_norm=87.633, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.242e-04, train_time=2.663 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:51:30,472 (trainer:732) INFO: 55epoch:train:1690-2252batch: iter_time=2.701e-04, forward_time=0.202, loss_att=36.674, acc=0.966, loss=36.674, backward_time=0.298, grad_norm=87.381, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.239e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 19:57:48,474 (trainer:732) INFO: 55epoch:train:2253-2815batch: iter_time=2.637e-04, forward_time=0.203, loss_att=37.302, acc=0.967, loss=37.302, backward_time=0.299, grad_norm=80.036, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.236e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:04:03,110 (trainer:732) INFO: 55epoch:train:2816-3378batch: iter_time=2.692e-04, forward_time=0.202, loss_att=37.343, acc=0.966, loss=37.343, backward_time=0.297, grad_norm=83.575, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.232e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:10:19,023 (trainer:732) INFO: 55epoch:train:3379-3941batch: iter_time=2.667e-04, forward_time=0.202, loss_att=36.267, acc=0.966, loss=36.267, backward_time=0.297, grad_norm=86.570, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.229e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:16:33,165 (trainer:732) INFO: 55epoch:train:3942-4504batch: iter_time=2.674e-04, forward_time=0.201, loss_att=36.454, acc=0.965, loss=36.454, backward_time=0.296, grad_norm=79.895, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.226e-04, train_time=2.657 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:22:47,464 (trainer:732) INFO: 55epoch:train:4505-5067batch: iter_time=2.584e-04, forward_time=0.201, loss_att=35.636, acc=0.967, loss=35.636, backward_time=0.296, grad_norm=80.157, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.222e-04, train_time=2.660 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:29:03,447 (trainer:732) INFO: 55epoch:train:5068-5630batch: iter_time=2.632e-04, forward_time=0.202, loss_att=37.456, acc=0.966, loss=37.456, backward_time=0.297, grad_norm=81.293, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.219e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:35:19,569 (trainer:732) INFO: 55epoch:train:5631-6193batch: iter_time=2.685e-04, forward_time=0.202, loss_att=36.753, acc=0.966, loss=36.753, backward_time=0.297, grad_norm=86.339, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.216e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:41:34,321 (trainer:732) INFO: 55epoch:train:6194-6756batch: iter_time=2.651e-04, forward_time=0.201, loss_att=36.697, acc=0.966, loss=36.697, backward_time=0.296, grad_norm=87.156, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.212e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:47:48,909 (trainer:732) INFO: 55epoch:train:6757-7319batch: iter_time=2.673e-04, forward_time=0.201, loss_att=37.094, acc=0.965, loss=37.094, backward_time=0.296, grad_norm=83.794, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.209e-04, train_time=2.662 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 20:54:05,635 (trainer:732) INFO: 55epoch:train:7320-7882batch: iter_time=2.738e-04, forward_time=0.202, loss_att=37.315, acc=0.966, loss=37.315, backward_time=0.298, grad_norm=86.606, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.206e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:00:22,295 (trainer:732) INFO: 55epoch:train:7883-8445batch: iter_time=2.786e-04, forward_time=0.203, loss_att=38.077, acc=0.965, loss=38.077, backward_time=0.298, grad_norm=82.494, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.202e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:06:35,938 (trainer:732) INFO: 55epoch:train:8446-9008batch: iter_time=2.547e-04, forward_time=0.202, loss_att=37.620, acc=0.966, loss=37.620, backward_time=0.295, grad_norm=96.115, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.199e-04, train_time=2.654 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:12:49,135 (trainer:732) INFO: 55epoch:train:9009-9571batch: iter_time=2.510e-04, forward_time=0.202, loss_att=36.586, acc=0.966, loss=36.586, backward_time=0.294, grad_norm=80.859, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.196e-04, train_time=2.652 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:19:09,422 (trainer:732) INFO: 55epoch:train:9572-10134batch: iter_time=3.290e-04, forward_time=0.205, loss_att=36.657, acc=0.966, loss=36.657, backward_time=0.299, grad_norm=80.467, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.193e-04, train_time=2.701 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:25:29,146 (trainer:732) INFO: 55epoch:train:10135-10697batch: iter_time=3.415e-04, forward_time=0.205, loss_att=37.194, acc=0.966, loss=37.194, backward_time=0.299, grad_norm=84.959, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.189e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:31:46,231 (trainer:732) INFO: 55epoch:train:10698-11260batch: iter_time=3.290e-04, forward_time=0.203, loss_att=36.865, acc=0.965, loss=36.865, backward_time=0.296, grad_norm=83.192, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.186e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:40:37,871 (trainer:338) INFO: 55epoch results: [train] iter_time=3.720e-04, forward_time=0.203, loss_att=36.830, acc=0.966, loss=36.830, backward_time=0.297, grad_norm=84.767, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.217e-04, train_time=2.728, time=2 hours, 8 minutes and 20.81 seconds, total_count=619960, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.484, acc=0.983, cer=0.021, wer=0.079, loss=10.484, time=4 minutes and 57.35 seconds, total_count=3080, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 35.02 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:40:42,129 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:40:42,145 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/51epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:40:42,146 (trainer:272) INFO: 56/60epoch started. Estimated time to finish: 11 hours, 21 minutes and 31.34 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:50:06,645 (trainer:732) INFO: 56epoch:train:1-563batch: iter_time=0.003, forward_time=0.206, loss_att=34.981, acc=0.968, loss=34.981, backward_time=0.300, grad_norm=82.729, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.183e-04, train_time=4.018 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 21:56:24,497 (trainer:732) INFO: 56epoch:train:564-1126batch: iter_time=3.075e-04, forward_time=0.204, loss_att=35.798, acc=0.967, loss=35.798, backward_time=0.298, grad_norm=85.428, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.180e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:02:45,768 (trainer:732) INFO: 56epoch:train:1127-1689batch: iter_time=3.178e-04, forward_time=0.205, loss_att=36.776, acc=0.967, loss=36.776, backward_time=0.300, grad_norm=88.473, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.176e-04, train_time=2.708 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:09:04,410 (trainer:732) INFO: 56epoch:train:1690-2252batch: iter_time=3.157e-04, forward_time=0.204, loss_att=35.252, acc=0.967, loss=35.252, backward_time=0.298, grad_norm=81.581, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.173e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:15:25,320 (trainer:732) INFO: 56epoch:train:2253-2815batch: iter_time=3.056e-04, forward_time=0.206, loss_att=36.851, acc=0.966, loss=36.851, backward_time=0.300, grad_norm=87.392, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.170e-04, train_time=2.708 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:21:43,171 (trainer:732) INFO: 56epoch:train:2816-3378batch: iter_time=3.077e-04, forward_time=0.203, loss_att=36.697, acc=0.966, loss=36.697, backward_time=0.297, grad_norm=79.806, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.167e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:28:04,562 (trainer:732) INFO: 56epoch:train:3379-3941batch: iter_time=3.053e-04, forward_time=0.206, loss_att=36.900, acc=0.966, loss=36.900, backward_time=0.301, grad_norm=90.336, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.163e-04, train_time=2.708 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:34:23,020 (trainer:732) INFO: 56epoch:train:3942-4504batch: iter_time=3.130e-04, forward_time=0.204, loss_att=35.602, acc=0.967, loss=35.602, backward_time=0.298, grad_norm=85.091, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.160e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:40:42,363 (trainer:732) INFO: 56epoch:train:4505-5067batch: iter_time=3.159e-04, forward_time=0.205, loss_att=36.408, acc=0.966, loss=36.408, backward_time=0.299, grad_norm=81.921, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.157e-04, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:47:01,509 (trainer:732) INFO: 56epoch:train:5068-5630batch: iter_time=3.236e-04, forward_time=0.204, loss_att=35.820, acc=0.967, loss=35.820, backward_time=0.298, grad_norm=82.552, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.154e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:53:20,856 (trainer:732) INFO: 56epoch:train:5631-6193batch: iter_time=3.144e-04, forward_time=0.204, loss_att=36.405, acc=0.966, loss=36.405, backward_time=0.298, grad_norm=95.105, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.150e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 22:59:41,555 (trainer:732) INFO: 56epoch:train:6194-6756batch: iter_time=3.341e-04, forward_time=0.206, loss_att=37.083, acc=0.966, loss=37.083, backward_time=0.300, grad_norm=89.503, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.147e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:06:01,936 (trainer:732) INFO: 56epoch:train:6757-7319batch: iter_time=3.146e-04, forward_time=0.205, loss_att=37.372, acc=0.966, loss=37.372, backward_time=0.300, grad_norm=83.835, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.144e-04, train_time=2.702 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:12:20,069 (trainer:732) INFO: 56epoch:train:7320-7882batch: iter_time=3.082e-04, forward_time=0.204, loss_att=35.718, acc=0.967, loss=35.718, backward_time=0.297, grad_norm=82.523, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.141e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:18:38,667 (trainer:732) INFO: 56epoch:train:7883-8445batch: iter_time=3.215e-04, forward_time=0.204, loss_att=36.475, acc=0.966, loss=36.475, backward_time=0.298, grad_norm=81.516, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.138e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:24:59,262 (trainer:732) INFO: 56epoch:train:8446-9008batch: iter_time=3.112e-04, forward_time=0.205, loss_att=36.722, acc=0.966, loss=36.722, backward_time=0.299, grad_norm=81.950, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.134e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:31:18,328 (trainer:732) INFO: 56epoch:train:9009-9571batch: iter_time=3.201e-04, forward_time=0.204, loss_att=37.218, acc=0.965, loss=37.218, backward_time=0.299, grad_norm=83.256, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.131e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:37:36,915 (trainer:732) INFO: 56epoch:train:9572-10134batch: iter_time=3.255e-04, forward_time=0.204, loss_att=37.554, acc=0.965, loss=37.554, backward_time=0.298, grad_norm=81.498, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.128e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:43:55,826 (trainer:732) INFO: 56epoch:train:10135-10697batch: iter_time=3.131e-04, forward_time=0.204, loss_att=37.296, acc=0.965, loss=37.296, backward_time=0.298, grad_norm=84.369, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.125e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:50:17,446 (trainer:732) INFO: 56epoch:train:10698-11260batch: iter_time=3.049e-04, forward_time=0.206, loss_att=36.948, acc=0.967, loss=36.948, backward_time=0.300, grad_norm=86.994, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.122e-04, train_time=2.710 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:59:24,504 (trainer:338) INFO: 56epoch results: [train] iter_time=4.407e-04, forward_time=0.205, loss_att=36.492, acc=0.966, loss=36.492, backward_time=0.299, grad_norm=84.815, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.152e-04, train_time=2.762, time=2 hours, 9 minutes and 59.76 seconds, total_count=631232, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.504, acc=0.983, cer=0.021, wer=0.080, loss=10.504, time=5 minutes and 11.21 seconds, total_count=3136, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 31.38 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:59:29,122 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:59:29,139 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/44epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-26 23:59:29,140 (trainer:272) INFO: 57/60epoch started. Estimated time to finish: 9 hours, 5 minutes and 23.69 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:08:46,506 (trainer:732) INFO: 57epoch:train:1-563batch: iter_time=0.002, forward_time=0.204, loss_att=35.506, acc=0.967, loss=35.506, backward_time=0.297, grad_norm=86.497, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.118e-04, train_time=3.968 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:15:08,016 (trainer:732) INFO: 57epoch:train:564-1126batch: iter_time=3.317e-04, forward_time=0.205, loss_att=36.538, acc=0.967, loss=36.538, backward_time=0.300, grad_norm=85.961, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.115e-04, train_time=2.710 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:21:27,475 (trainer:732) INFO: 57epoch:train:1127-1689batch: iter_time=3.299e-04, forward_time=0.205, loss_att=36.103, acc=0.966, loss=36.103, backward_time=0.299, grad_norm=86.019, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.112e-04, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:27:48,744 (trainer:732) INFO: 57epoch:train:1690-2252batch: iter_time=3.482e-04, forward_time=0.206, loss_att=35.974, acc=0.968, loss=35.974, backward_time=0.300, grad_norm=83.159, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.109e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:34:09,296 (trainer:732) INFO: 57epoch:train:2253-2815batch: iter_time=3.278e-04, forward_time=0.206, loss_att=35.917, acc=0.967, loss=35.917, backward_time=0.300, grad_norm=78.763, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.106e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:40:29,856 (trainer:732) INFO: 57epoch:train:2816-3378batch: iter_time=3.416e-04, forward_time=0.205, loss_att=35.933, acc=0.967, loss=35.933, backward_time=0.300, grad_norm=89.774, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.103e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:46:48,938 (trainer:732) INFO: 57epoch:train:3379-3941batch: iter_time=3.344e-04, forward_time=0.204, loss_att=36.392, acc=0.966, loss=36.392, backward_time=0.298, grad_norm=100.789, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.099e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:53:09,206 (trainer:732) INFO: 57epoch:train:3942-4504batch: iter_time=3.354e-04, forward_time=0.205, loss_att=36.720, acc=0.966, loss=36.720, backward_time=0.299, grad_norm=86.062, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.096e-04, train_time=2.701 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 00:59:31,751 (trainer:732) INFO: 57epoch:train:4505-5067batch: iter_time=3.337e-04, forward_time=0.206, loss_att=36.562, acc=0.967, loss=36.562, backward_time=0.302, grad_norm=94.283, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.093e-04, train_time=2.718 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:05:48,825 (trainer:732) INFO: 57epoch:train:5068-5630batch: iter_time=3.331e-04, forward_time=0.204, loss_att=35.434, acc=0.967, loss=35.434, backward_time=0.297, grad_norm=82.098, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.090e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:12:08,048 (trainer:732) INFO: 57epoch:train:5631-6193batch: iter_time=3.384e-04, forward_time=0.204, loss_att=37.701, acc=0.966, loss=37.701, backward_time=0.298, grad_norm=87.330, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.087e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:18:29,099 (trainer:732) INFO: 57epoch:train:6194-6756batch: iter_time=3.448e-04, forward_time=0.206, loss_att=36.922, acc=0.966, loss=36.922, backward_time=0.300, grad_norm=87.133, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.084e-04, train_time=2.706 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:24:48,210 (trainer:732) INFO: 57epoch:train:6757-7319batch: iter_time=3.322e-04, forward_time=0.205, loss_att=36.441, acc=0.967, loss=36.441, backward_time=0.299, grad_norm=83.665, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.081e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:31:06,446 (trainer:732) INFO: 57epoch:train:7320-7882batch: iter_time=3.312e-04, forward_time=0.204, loss_att=36.056, acc=0.966, loss=36.056, backward_time=0.298, grad_norm=84.141, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.078e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:37:24,887 (trainer:732) INFO: 57epoch:train:7883-8445batch: iter_time=3.298e-04, forward_time=0.204, loss_att=36.011, acc=0.966, loss=36.011, backward_time=0.298, grad_norm=78.781, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.074e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:43:45,481 (trainer:732) INFO: 57epoch:train:8446-9008batch: iter_time=3.348e-04, forward_time=0.205, loss_att=36.542, acc=0.967, loss=36.542, backward_time=0.300, grad_norm=83.477, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.071e-04, train_time=2.703 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:50:04,121 (trainer:732) INFO: 57epoch:train:9009-9571batch: iter_time=3.375e-04, forward_time=0.204, loss_att=36.032, acc=0.966, loss=36.032, backward_time=0.298, grad_norm=79.410, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.068e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 01:56:22,213 (trainer:732) INFO: 57epoch:train:9572-10134batch: iter_time=3.318e-04, forward_time=0.204, loss_att=36.521, acc=0.966, loss=36.521, backward_time=0.298, grad_norm=83.586, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.065e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:02:42,072 (trainer:732) INFO: 57epoch:train:10135-10697batch: iter_time=3.401e-04, forward_time=0.205, loss_att=36.136, acc=0.967, loss=36.136, backward_time=0.299, grad_norm=103.706, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.062e-04, train_time=2.699 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:08:58,751 (trainer:732) INFO: 57epoch:train:10698-11260batch: iter_time=3.256e-04, forward_time=0.203, loss_att=35.605, acc=0.966, loss=35.605, backward_time=0.296, grad_norm=80.516, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.059e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:18:09,665 (trainer:338) INFO: 57epoch results: [train] iter_time=4.212e-04, forward_time=0.205, loss_att=36.257, acc=0.967, loss=36.257, backward_time=0.299, grad_norm=86.256, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.088e-04, train_time=2.760, time=2 hours, 9 minutes and 53.39 seconds, total_count=642504, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.476, acc=0.983, cer=0.021, wer=0.079, loss=10.476, time=5 minutes and 14.89 seconds, total_count=3192, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 32.24 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:18:14,189 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:18:14,208 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/47epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:18:14,209 (trainer:272) INFO: 58/60epoch started. Estimated time to finish: 6 hours, 49 minutes and 10.36 seconds + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:27:32,683 (trainer:732) INFO: 58epoch:train:1-563batch: iter_time=0.002, forward_time=0.205, loss_att=34.384, acc=0.968, loss=34.384, backward_time=0.298, grad_norm=85.974, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.056e-04, train_time=3.975 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:33:52,948 (trainer:732) INFO: 58epoch:train:564-1126batch: iter_time=3.385e-04, forward_time=0.206, loss_att=35.588, acc=0.967, loss=35.588, backward_time=0.300, grad_norm=87.097, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.053e-04, train_time=2.702 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:40:15,328 (trainer:732) INFO: 58epoch:train:1127-1689batch: iter_time=3.327e-04, forward_time=0.206, loss_att=36.530, acc=0.967, loss=36.530, backward_time=0.301, grad_norm=89.632, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.050e-04, train_time=2.716 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:46:35,466 (trainer:732) INFO: 58epoch:train:1690-2252batch: iter_time=3.339e-04, forward_time=0.205, loss_att=35.106, acc=0.968, loss=35.106, backward_time=0.299, grad_norm=88.119, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.046e-04, train_time=2.700 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:52:54,222 (trainer:732) INFO: 58epoch:train:2253-2815batch: iter_time=3.387e-04, forward_time=0.204, loss_att=35.365, acc=0.967, loss=35.365, backward_time=0.298, grad_norm=84.422, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.043e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 02:59:11,891 (trainer:732) INFO: 58epoch:train:2816-3378batch: iter_time=3.322e-04, forward_time=0.204, loss_att=36.024, acc=0.966, loss=36.024, backward_time=0.297, grad_norm=86.372, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.040e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:05:32,442 (trainer:732) INFO: 58epoch:train:3379-3941batch: iter_time=3.440e-04, forward_time=0.205, loss_att=36.086, acc=0.967, loss=36.086, backward_time=0.300, grad_norm=81.009, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.037e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:11:52,357 (trainer:732) INFO: 58epoch:train:3942-4504batch: iter_time=3.342e-04, forward_time=0.205, loss_att=35.511, acc=0.967, loss=35.511, backward_time=0.299, grad_norm=91.048, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=7.034e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:18:14,343 (trainer:732) INFO: 58epoch:train:4505-5067batch: iter_time=3.420e-04, forward_time=0.207, loss_att=36.301, acc=0.967, loss=36.301, backward_time=0.301, grad_norm=86.387, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.031e-04, train_time=2.716 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:24:32,073 (trainer:732) INFO: 58epoch:train:5068-5630batch: iter_time=3.388e-04, forward_time=0.204, loss_att=35.664, acc=0.966, loss=35.664, backward_time=0.297, grad_norm=79.171, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.028e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:30:51,962 (trainer:732) INFO: 58epoch:train:5631-6193batch: iter_time=3.297e-04, forward_time=0.205, loss_att=36.336, acc=0.967, loss=36.336, backward_time=0.298, grad_norm=81.211, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.025e-04, train_time=2.700 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:37:13,488 (trainer:732) INFO: 58epoch:train:6194-6756batch: iter_time=3.228e-04, forward_time=0.206, loss_att=35.749, acc=0.967, loss=35.749, backward_time=0.300, grad_norm=86.077, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.022e-04, train_time=2.709 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:43:35,541 (trainer:732) INFO: 58epoch:train:6757-7319batch: iter_time=3.361e-04, forward_time=0.206, loss_att=37.941, acc=0.966, loss=37.941, backward_time=0.301, grad_norm=84.108, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.019e-04, train_time=2.715 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:49:53,844 (trainer:732) INFO: 58epoch:train:7320-7882batch: iter_time=3.386e-04, forward_time=0.204, loss_att=35.638, acc=0.967, loss=35.638, backward_time=0.298, grad_norm=86.767, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.016e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 03:56:14,391 (trainer:732) INFO: 58epoch:train:7883-8445batch: iter_time=3.358e-04, forward_time=0.205, loss_att=36.494, acc=0.967, loss=36.494, backward_time=0.300, grad_norm=82.228, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.013e-04, train_time=2.702 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:02:30,758 (trainer:732) INFO: 58epoch:train:8446-9008batch: iter_time=3.350e-04, forward_time=0.203, loss_att=36.521, acc=0.965, loss=36.521, backward_time=0.295, grad_norm=82.725, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.010e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:08:49,629 (trainer:732) INFO: 58epoch:train:9009-9571batch: iter_time=3.357e-04, forward_time=0.204, loss_att=35.714, acc=0.967, loss=35.714, backward_time=0.298, grad_norm=89.342, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.007e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:15:09,448 (trainer:732) INFO: 58epoch:train:9572-10134batch: iter_time=3.383e-04, forward_time=0.205, loss_att=36.277, acc=0.966, loss=36.277, backward_time=0.299, grad_norm=79.828, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.004e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:21:27,726 (trainer:732) INFO: 58epoch:train:10135-10697batch: iter_time=3.277e-04, forward_time=0.204, loss_att=35.746, acc=0.966, loss=35.746, backward_time=0.298, grad_norm=86.383, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.001e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:27:47,346 (trainer:732) INFO: 58epoch:train:10698-11260batch: iter_time=3.344e-04, forward_time=0.205, loss_att=35.874, acc=0.967, loss=35.874, backward_time=0.298, grad_norm=88.987, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.998e-04, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:37:09,905 (trainer:338) INFO: 58epoch results: [train] iter_time=4.322e-04, forward_time=0.205, loss_att=35.934, acc=0.967, loss=35.934, backward_time=0.299, grad_norm=85.337, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=7.027e-04, train_time=2.761, time=2 hours, 9 minutes and 57.81 seconds, total_count=653776, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.328, acc=0.984, cer=0.020, wer=0.078, loss=10.328, time=5 minutes and 22.73 seconds, total_count=3248, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 35.16 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:37:14,408 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:37:14,423 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/50epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:37:14,424 (trainer:272) INFO: 59/60epoch started. Estimated time to finish: 4 hours, 32 minutes and 52.31 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:46:33,871 (trainer:732) INFO: 59epoch:train:1-563batch: iter_time=0.003, forward_time=0.205, loss_att=34.533, acc=0.968, loss=34.533, backward_time=0.299, grad_norm=83.433, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.995e-04, train_time=3.981 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:52:53,479 (trainer:732) INFO: 59epoch:train:564-1126batch: iter_time=3.278e-04, forward_time=0.205, loss_att=34.593, acc=0.968, loss=34.593, backward_time=0.299, grad_norm=75.962, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.992e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 04:59:13,711 (trainer:732) INFO: 59epoch:train:1127-1689batch: iter_time=3.393e-04, forward_time=0.205, loss_att=36.961, acc=0.967, loss=36.961, backward_time=0.300, grad_norm=93.406, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.989e-04, train_time=2.701 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:05:33,700 (trainer:732) INFO: 59epoch:train:1690-2252batch: iter_time=3.445e-04, forward_time=0.205, loss_att=36.176, acc=0.967, loss=36.176, backward_time=0.300, grad_norm=87.229, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.986e-04, train_time=2.699 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:11:54,860 (trainer:732) INFO: 59epoch:train:2253-2815batch: iter_time=3.188e-04, forward_time=0.205, loss_att=35.787, acc=0.968, loss=35.787, backward_time=0.300, grad_norm=84.079, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.983e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:18:12,522 (trainer:732) INFO: 59epoch:train:2816-3378batch: iter_time=3.289e-04, forward_time=0.204, loss_att=34.503, acc=0.967, loss=34.503, backward_time=0.297, grad_norm=79.277, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.980e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:24:34,313 (trainer:732) INFO: 59epoch:train:3379-3941batch: iter_time=3.380e-04, forward_time=0.206, loss_att=35.594, acc=0.967, loss=35.594, backward_time=0.300, grad_norm=91.087, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.977e-04, train_time=2.711 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:30:52,624 (trainer:732) INFO: 59epoch:train:3942-4504batch: iter_time=3.270e-04, forward_time=0.204, loss_att=35.135, acc=0.967, loss=35.135, backward_time=0.298, grad_norm=88.735, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.974e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:37:14,380 (trainer:732) INFO: 59epoch:train:4505-5067batch: iter_time=3.338e-04, forward_time=0.206, loss_att=36.674, acc=0.966, loss=36.674, backward_time=0.301, grad_norm=95.993, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.971e-04, train_time=2.713 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:43:31,851 (trainer:732) INFO: 59epoch:train:5068-5630batch: iter_time=3.240e-04, forward_time=0.204, loss_att=35.950, acc=0.966, loss=35.950, backward_time=0.297, grad_norm=87.647, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.968e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:49:51,552 (trainer:732) INFO: 59epoch:train:5631-6193batch: iter_time=3.402e-04, forward_time=0.205, loss_att=35.969, acc=0.967, loss=35.969, backward_time=0.298, grad_norm=80.371, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.965e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 05:56:11,836 (trainer:732) INFO: 59epoch:train:6194-6756batch: iter_time=3.285e-04, forward_time=0.206, loss_att=35.768, acc=0.967, loss=35.768, backward_time=0.299, grad_norm=80.926, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.962e-04, train_time=2.700 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:02:31,561 (trainer:732) INFO: 59epoch:train:6757-7319batch: iter_time=3.339e-04, forward_time=0.205, loss_att=35.268, acc=0.967, loss=35.268, backward_time=0.299, grad_norm=90.832, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.959e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:08:51,127 (trainer:732) INFO: 59epoch:train:7320-7882batch: iter_time=3.247e-04, forward_time=0.205, loss_att=35.748, acc=0.967, loss=35.748, backward_time=0.298, grad_norm=81.990, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.956e-04, train_time=2.697 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:15:08,506 (trainer:732) INFO: 59epoch:train:7883-8445batch: iter_time=3.293e-04, forward_time=0.204, loss_att=35.247, acc=0.967, loss=35.247, backward_time=0.298, grad_norm=77.837, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.953e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:21:30,737 (trainer:732) INFO: 59epoch:train:8446-9008batch: iter_time=3.202e-04, forward_time=0.206, loss_att=36.425, acc=0.967, loss=36.425, backward_time=0.301, grad_norm=85.616, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.950e-04, train_time=2.714 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:27:51,597 (trainer:732) INFO: 59epoch:train:9009-9571batch: iter_time=3.267e-04, forward_time=0.205, loss_att=37.254, acc=0.967, loss=37.254, backward_time=0.300, grad_norm=87.092, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.947e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:34:09,925 (trainer:732) INFO: 59epoch:train:9572-10134batch: iter_time=3.197e-04, forward_time=0.205, loss_att=35.656, acc=0.967, loss=35.656, backward_time=0.298, grad_norm=83.615, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.944e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:40:28,142 (trainer:732) INFO: 59epoch:train:10135-10697batch: iter_time=3.311e-04, forward_time=0.204, loss_att=35.024, acc=0.967, loss=35.024, backward_time=0.298, grad_norm=79.170, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.941e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:46:44,989 (trainer:732) INFO: 59epoch:train:10698-11260batch: iter_time=3.174e-04, forward_time=0.203, loss_att=35.705, acc=0.967, loss=35.705, backward_time=0.296, grad_norm=88.907, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.938e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:55:39,858 (trainer:338) INFO: 59epoch results: [train] iter_time=4.499e-04, forward_time=0.205, loss_att=35.690, acc=0.967, loss=35.690, backward_time=0.299, grad_norm=85.157, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.966e-04, train_time=2.760, time=2 hours, 9 minutes and 52.6 seconds, total_count=665048, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.403, acc=0.984, cer=0.021, wer=0.078, loss=10.403, time=5 minutes and 1.93 seconds, total_count=3304, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 30.9 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:55:44,232 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:55:44,252 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/49epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 06:55:44,253 (trainer:272) INFO: 60/60epoch started. Estimated time to finish: 2 hours, 16 minutes and 28.25 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:05:05,876 (trainer:732) INFO: 60epoch:train:1-563batch: iter_time=0.002, forward_time=0.205, loss_att=34.675, acc=0.968, loss=34.675, backward_time=0.300, grad_norm=87.412, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.935e-04, train_time=3.997 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:11:26,042 (trainer:732) INFO: 60epoch:train:564-1126batch: iter_time=3.367e-04, forward_time=0.205, loss_att=35.587, acc=0.968, loss=35.587, backward_time=0.300, grad_norm=85.930, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.932e-04, train_time=2.701 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:17:46,774 (trainer:732) INFO: 60epoch:train:1127-1689batch: iter_time=3.268e-04, forward_time=0.205, loss_att=34.320, acc=0.968, loss=34.320, backward_time=0.300, grad_norm=90.840, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.929e-04, train_time=2.704 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:24:04,303 (trainer:732) INFO: 60epoch:train:1690-2252batch: iter_time=3.233e-04, forward_time=0.204, loss_att=34.903, acc=0.968, loss=34.903, backward_time=0.297, grad_norm=81.663, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.926e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:30:23,950 (trainer:732) INFO: 60epoch:train:2253-2815batch: iter_time=3.224e-04, forward_time=0.205, loss_att=36.524, acc=0.967, loss=36.524, backward_time=0.299, grad_norm=79.628, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.923e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:36:43,140 (trainer:732) INFO: 60epoch:train:2816-3378batch: iter_time=3.213e-04, forward_time=0.205, loss_att=35.196, acc=0.967, loss=35.196, backward_time=0.299, grad_norm=86.948, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.921e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:43:01,015 (trainer:732) INFO: 60epoch:train:3379-3941batch: iter_time=3.269e-04, forward_time=0.204, loss_att=33.520, acc=0.968, loss=33.520, backward_time=0.297, grad_norm=77.266, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.918e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.205<37134> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:49:19,650 (trainer:732) INFO: 60epoch:train:3942-4504batch: iter_time=3.252e-04, forward_time=0.204, loss_att=35.758, acc=0.967, loss=35.758, backward_time=0.298, grad_norm=79.630, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.915e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 07:55:37,202 (trainer:732) INFO: 60epoch:train:4505-5067batch: iter_time=3.283e-04, forward_time=0.204, loss_att=35.531, acc=0.967, loss=35.531, backward_time=0.297, grad_norm=84.428, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.912e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:01:57,682 (trainer:732) INFO: 60epoch:train:5068-5630batch: iter_time=3.379e-04, forward_time=0.205, loss_att=35.734, acc=0.967, loss=35.734, backward_time=0.300, grad_norm=81.259, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.909e-04, train_time=2.702 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:08:15,908 (trainer:732) INFO: 60epoch:train:5631-6193batch: iter_time=3.350e-04, forward_time=0.204, loss_att=35.013, acc=0.967, loss=35.013, backward_time=0.298, grad_norm=83.098, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.906e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:14:38,022 (trainer:732) INFO: 60epoch:train:6194-6756batch: iter_time=3.325e-04, forward_time=0.206, loss_att=35.719, acc=0.968, loss=35.719, backward_time=0.301, grad_norm=89.654, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.903e-04, train_time=2.714 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:20:56,417 (trainer:732) INFO: 60epoch:train:6757-7319batch: iter_time=3.311e-04, forward_time=0.205, loss_att=35.851, acc=0.966, loss=35.851, backward_time=0.298, grad_norm=87.021, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.900e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:27:16,851 (trainer:732) INFO: 60epoch:train:7320-7882batch: iter_time=3.207e-04, forward_time=0.206, loss_att=35.648, acc=0.967, loss=35.648, backward_time=0.300, grad_norm=88.319, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.897e-04, train_time=2.703 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:33:35,280 (trainer:732) INFO: 60epoch:train:7883-8445batch: iter_time=3.255e-04, forward_time=0.204, loss_att=34.741, acc=0.967, loss=34.741, backward_time=0.298, grad_norm=86.839, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.894e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:39:55,788 (trainer:732) INFO: 60epoch:train:8446-9008batch: iter_time=3.255e-04, forward_time=0.206, loss_att=36.211, acc=0.967, loss=36.211, backward_time=0.300, grad_norm=91.852, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.892e-04, train_time=2.702 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1849197:1849252 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:46:15,116 (trainer:732) INFO: 60epoch:train:9009-9571batch: iter_time=3.234e-04, forward_time=0.204, loss_att=36.133, acc=0.967, loss=36.133, backward_time=0.299, grad_norm=83.850, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.889e-04, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:52:36,134 (trainer:732) INFO: 60epoch:train:9572-10134batch: iter_time=3.242e-04, forward_time=0.206, loss_att=36.186, acc=0.967, loss=36.186, backward_time=0.301, grad_norm=82.334, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.886e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 08:58:57,228 (trainer:732) INFO: 60epoch:train:10135-10697batch: iter_time=3.316e-04, forward_time=0.205, loss_att=35.976, acc=0.967, loss=35.976, backward_time=0.300, grad_norm=82.515, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.883e-04, train_time=2.707 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:05:14,581 (trainer:732) INFO: 60epoch:train:10698-11260batch: iter_time=3.149e-04, forward_time=0.204, loss_att=35.467, acc=0.967, loss=35.467, backward_time=0.297, grad_norm=79.281, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.880e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:13:55,690 (trainer:338) INFO: 60epoch results: [train] iter_time=4.249e-04, forward_time=0.205, loss_att=35.433, acc=0.967, loss=35.433, backward_time=0.299, grad_norm=84.468, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.907e-04, train_time=2.760, time=2 hours, 9 minutes and 52.35 seconds, total_count=676320, gpu_max_cached_mem_GB=30.271, [valid] loss_att=10.238, acc=0.984, cer=0.020, wer=0.078, loss=10.238, time=4 minutes and 59.18 seconds, total_count=3360, gpu_max_cached_mem_GB=30.271, [att_plot] time=3 minutes and 19.91 seconds, total_count=0, gpu_max_cached_mem_GB=30.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:13:59,601 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:13:59,617 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/45epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:13:59,618 (trainer:458) INFO: The training was finished at 60 epochs +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:13:59,679 (average_nbest_models:69) INFO: Averaging 10best models: criterion="valid.acc": exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,764 (average_nbest_models:96) INFO: Accumulating encoder.encoders.0.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,767 (average_nbest_models:96) INFO: Accumulating encoder.encoders.1.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,769 (average_nbest_models:96) INFO: Accumulating encoder.encoders.2.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,771 (average_nbest_models:96) INFO: Accumulating encoder.encoders.3.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,774 (average_nbest_models:96) INFO: Accumulating encoder.encoders.4.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,776 (average_nbest_models:96) INFO: Accumulating encoder.encoders.5.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,778 (average_nbest_models:96) INFO: Accumulating encoder.encoders.6.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,780 (average_nbest_models:96) INFO: Accumulating encoder.encoders.7.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,783 (average_nbest_models:96) INFO: Accumulating encoder.encoders.8.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,785 (average_nbest_models:96) INFO: Accumulating encoder.encoders.9.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,787 (average_nbest_models:96) INFO: Accumulating encoder.encoders.10.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-02-27 09:14:08,789 (average_nbest_models:96) INFO: Accumulating encoder.encoders.11.conv_module.norm.num_batches_tracked instead of averaging +# Accounting: time=491528 threads=1 +# Ended (code 0) at Tue Feb 27 09:14:12 CST 2024, elapsed time 491528 seconds diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth new file mode 100644 index 0000000000000000000000000000000000000000..56e9181e5d525812968b501374ac2c282a0f811d --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea74ad613da66e5d904360cd0038724e9941d7e4d1828132a6fb8d7c9d018b61 +size 172358249 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth new file mode 100644 index 0000000000000000000000000000000000000000..56e9181e5d525812968b501374ac2c282a0f811d --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea74ad613da66e5d904360cd0038724e9941d7e4d1828132a6fb8d7c9d018b61 +size 172358249 diff --git a/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b57f292eef5cc25c79d59e927cc0d2c9a1234fe --- /dev/null +++ b/medium/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fff948ccc68c37ec9d32b89b18ead7d3fd3a8148f35e984996280b1556dd5ea6 +size 172367337