diff --git "a/logs/main/main_log.txt" "b/logs/main/main_log.txt" --- "a/logs/main/main_log.txt" +++ "b/logs/main/main_log.txt" @@ -9335,3 +9335,654 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default [default7]: iteration 3186/ 3814 | consumed samples: 1631232 | consumed tokens: 3340763136 | elapsed time per iteration (s): 5.40 | learning rate: 1.350E-05 | global batch size: 512 | lm loss: 4.269930E+00 | loss scale: 262144.0 | grad norm: 0.107 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.789 | TFLOPs: 71.60 | [default7]: iteration 3187/ 3814 | consumed samples: 1631744 | consumed tokens: 3341811712 | elapsed time per iteration (s): 5.40 | learning rate: 1.346E-05 | global batch size: 512 | lm loss: 4.226227E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.851 | TFLOPs: 71.64 | [default7]: iteration 3188/ 3814 | consumed samples: 1632256 | consumed tokens: 3342860288 | elapsed time per iteration (s): 5.44 | learning rate: 1.342E-05 | global batch size: 512 | lm loss: 4.220816E+00 | loss scale: 262144.0 | grad norm: 0.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.041 | TFLOPs: 71.03 | +[default7]: iteration 3189/ 3814 | consumed samples: 1632768 | consumed tokens: 3343908864 | elapsed time per iteration (s): 5.42 | learning rate: 1.337E-05 | global batch size: 512 | lm loss: 4.234136E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.473 | TFLOPs: 71.36 | +[default7]: iteration 3190/ 3814 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 5.41 | learning rate: 1.333E-05 | global batch size: 512 | lm loss: 4.267142E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.566 | TFLOPs: 71.43 | +[default7]: iteration 3191/ 3814 | consumed samples: 1633792 | consumed tokens: 3346006016 | elapsed time per iteration (s): 5.43 | learning rate: 1.329E-05 | global batch size: 512 | lm loss: 4.236691E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.271 | TFLOPs: 71.20 | +[default7]: iteration 3192/ 3814 | consumed samples: 1634304 | consumed tokens: 3347054592 | elapsed time per iteration (s): 5.43 | learning rate: 1.325E-05 | global batch size: 512 | lm loss: 4.248917E+00 | loss scale: 262144.0 | grad norm: 0.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.249 | TFLOPs: 71.19 | +[default7]: iteration 3193/ 3814 | consumed samples: 1634816 | consumed tokens: 3348103168 | elapsed time per iteration (s): 5.42 | learning rate: 1.321E-05 | global batch size: 512 | lm loss: 4.240632E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.444 | TFLOPs: 71.33 | +[default7]: iteration 3194/ 3814 | consumed samples: 1635328 | consumed tokens: 3349151744 | elapsed time per iteration (s): 5.45 | learning rate: 1.317E-05 | global batch size: 512 | lm loss: 4.233636E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.957 | TFLOPs: 70.97 | +[default7]: iteration 3195/ 3814 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 5.45 | learning rate: 1.313E-05 | global batch size: 512 | lm loss: 4.248126E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.897 | TFLOPs: 70.92 | +[default7]: iteration 3196/ 3814 | consumed samples: 1636352 | consumed tokens: 3351248896 | elapsed time per iteration (s): 5.44 | learning rate: 1.308E-05 | global batch size: 512 | lm loss: 4.221838E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.050 | TFLOPs: 71.04 | +[default7]: iteration 3197/ 3814 | consumed samples: 1636864 | consumed tokens: 3352297472 | elapsed time per iteration (s): 5.52 | learning rate: 1.304E-05 | global batch size: 512 | lm loss: 4.244923E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.759 | TFLOPs: 70.06 | +[default7]: iteration 3198/ 3814 | consumed samples: 1637376 | consumed tokens: 3353346048 | elapsed time per iteration (s): 5.45 | learning rate: 1.300E-05 | global batch size: 512 | lm loss: 4.229372E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.006 | TFLOPs: 71.00 | +[default7]: iteration 3199/ 3814 | consumed samples: 1637888 | consumed tokens: 3354394624 | elapsed time per iteration (s): 5.43 | learning rate: 1.296E-05 | global batch size: 512 | lm loss: 4.251491E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.257 | TFLOPs: 71.19 | +[default7]: iteration 3200/ 3814 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 5.44 | learning rate: 1.292E-05 | global batch size: 512 | lm loss: 4.233970E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.055 | TFLOPs: 71.04 | +[default7]: iteration 3201/ 3814 | consumed samples: 1638912 | consumed tokens: 3356491776 | elapsed time per iteration (s): 5.45 | learning rate: 1.288E-05 | global batch size: 512 | lm loss: 4.252962E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.957 | TFLOPs: 70.97 | +[default7]: iteration 3202/ 3814 | consumed samples: 1639424 | consumed tokens: 3357540352 | elapsed time per iteration (s): 5.46 | learning rate: 1.284E-05 | global batch size: 512 | lm loss: 4.257475E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.843 | TFLOPs: 70.88 | +[default7]: iteration 3203/ 3814 | consumed samples: 1639936 | consumed tokens: 3358588928 | elapsed time per iteration (s): 5.46 | learning rate: 1.280E-05 | global batch size: 512 | lm loss: 4.221294E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.855 | TFLOPs: 70.89 | +[default7]: iteration 3204/ 3814 | consumed samples: 1640448 | consumed tokens: 3359637504 | elapsed time per iteration (s): 5.45 | learning rate: 1.276E-05 | global batch size: 512 | lm loss: 4.246326E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.923 | TFLOPs: 70.94 | +[default7]: iteration 3205/ 3814 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 5.43 | learning rate: 1.272E-05 | global batch size: 512 | lm loss: 4.265960E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.234 | TFLOPs: 71.18 | +[default7]: iteration 3206/ 3814 | consumed samples: 1641472 | consumed tokens: 3361734656 | elapsed time per iteration (s): 5.45 | learning rate: 1.268E-05 | global batch size: 512 | lm loss: 4.243158E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.886 | TFLOPs: 70.91 | +[default7]: iteration 3207/ 3814 | consumed samples: 1641984 | consumed tokens: 3362783232 | elapsed time per iteration (s): 5.43 | learning rate: 1.264E-05 | global batch size: 512 | lm loss: 4.222745E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.240 | TFLOPs: 71.18 | +[default7]: iteration 3208/ 3814 | consumed samples: 1642496 | consumed tokens: 3363831808 | elapsed time per iteration (s): 5.42 | learning rate: 1.259E-05 | global batch size: 512 | lm loss: 4.231118E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.437 | TFLOPs: 71.33 | +[default7]: iteration 3209/ 3814 | consumed samples: 1643008 | consumed tokens: 3364880384 | elapsed time per iteration (s): 5.43 | learning rate: 1.255E-05 | global batch size: 512 | lm loss: 4.251276E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.212 | TFLOPs: 71.16 | +[default7]: iteration 3210/ 3814 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 5.43 | learning rate: 1.251E-05 | global batch size: 512 | lm loss: 4.233292E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.243 | TFLOPs: 71.18 | +[default7]: iteration 3211/ 3814 | consumed samples: 1644032 | consumed tokens: 3366977536 | elapsed time per iteration (s): 5.42 | learning rate: 1.247E-05 | global batch size: 512 | lm loss: 4.237645E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.401 | TFLOPs: 71.30 | +[default7]: iteration 3212/ 3814 | consumed samples: 1644544 | consumed tokens: 3368026112 | elapsed time per iteration (s): 5.47 | learning rate: 1.243E-05 | global batch size: 512 | lm loss: 4.255022E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.630 | TFLOPs: 70.72 | +[default7]: iteration 3213/ 3814 | consumed samples: 1645056 | consumed tokens: 3369074688 | elapsed time per iteration (s): 5.44 | learning rate: 1.239E-05 | global batch size: 512 | lm loss: 4.250171E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.173 | TFLOPs: 71.13 | +[default7]: iteration 3214/ 3814 | consumed samples: 1645568 | consumed tokens: 3370123264 | elapsed time per iteration (s): 5.44 | learning rate: 1.235E-05 | global batch size: 512 | lm loss: 4.242435E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.073 | TFLOPs: 71.05 | +[default7]: iteration 3215/ 3814 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 5.43 | learning rate: 1.231E-05 | global batch size: 512 | lm loss: 4.248810E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.276 | TFLOPs: 71.21 | +[default7]: iteration 3216/ 3814 | consumed samples: 1646592 | consumed tokens: 3372220416 | elapsed time per iteration (s): 5.44 | learning rate: 1.227E-05 | global batch size: 512 | lm loss: 4.235618E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.124 | TFLOPs: 71.09 | +[default7]: iteration 3217/ 3814 | consumed samples: 1647104 | consumed tokens: 3373268992 | elapsed time per iteration (s): 5.46 | learning rate: 1.223E-05 | global batch size: 512 | lm loss: 4.255035E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.716 | TFLOPs: 70.78 | +[default7]: iteration 3218/ 3814 | consumed samples: 1647616 | consumed tokens: 3374317568 | elapsed time per iteration (s): 5.45 | learning rate: 1.219E-05 | global batch size: 512 | lm loss: 4.244114E+00 | loss scale: 262144.0 | grad norm: 0.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.981 | TFLOPs: 70.98 | +[default7]: iteration 3219/ 3814 | consumed samples: 1648128 | consumed tokens: 3375366144 | elapsed time per iteration (s): 5.44 | learning rate: 1.215E-05 | global batch size: 512 | lm loss: 4.261461E+00 | loss scale: 262144.0 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.096 | TFLOPs: 71.07 | +[default7]: iteration 3220/ 3814 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 5.42 | learning rate: 1.211E-05 | global batch size: 512 | lm loss: 4.229345E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.465 | TFLOPs: 71.35 | +[default7]: iteration 3221/ 3814 | consumed samples: 1649152 | consumed tokens: 3377463296 | elapsed time per iteration (s): 5.43 | learning rate: 1.207E-05 | global batch size: 512 | lm loss: 4.237742E+00 | loss scale: 262144.0 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.284 | TFLOPs: 71.21 | +[default7]: iteration 3222/ 3814 | consumed samples: 1649664 | consumed tokens: 3378511872 | elapsed time per iteration (s): 5.42 | learning rate: 1.203E-05 | global batch size: 512 | lm loss: 4.231729E+00 | loss scale: 262144.0 | grad norm: 0.109 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.471 | TFLOPs: 71.35 | +[default7]: iteration 3223/ 3814 | consumed samples: 1650176 | consumed tokens: 3379560448 | elapsed time per iteration (s): 5.42 | learning rate: 1.199E-05 | global batch size: 512 | lm loss: 4.255119E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.434 | TFLOPs: 71.33 | +[default7]: iteration 3224/ 3814 | consumed samples: 1650688 | consumed tokens: 3380609024 | elapsed time per iteration (s): 5.42 | learning rate: 1.195E-05 | global batch size: 512 | lm loss: 4.247835E+00 | loss scale: 262144.0 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.508 | TFLOPs: 71.38 | +[default7]: iteration 3225/ 3814 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 5.41 | learning rate: 1.192E-05 | global batch size: 512 | lm loss: 4.248127E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.616 | TFLOPs: 71.46 | +[default7]: iteration 3226/ 3814 | consumed samples: 1651712 | consumed tokens: 3382706176 | elapsed time per iteration (s): 5.42 | learning rate: 1.188E-05 | global batch size: 512 | lm loss: 4.240874E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.528 | TFLOPs: 71.40 | +[default7]: iteration 3227/ 3814 | consumed samples: 1652224 | consumed tokens: 3383754752 | elapsed time per iteration (s): 5.42 | learning rate: 1.184E-05 | global batch size: 512 | lm loss: 4.225278E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.523 | TFLOPs: 71.39 | +[default7]: iteration 3228/ 3814 | consumed samples: 1652736 | consumed tokens: 3384803328 | elapsed time per iteration (s): 5.40 | learning rate: 1.180E-05 | global batch size: 512 | lm loss: 4.214810E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.735 | TFLOPs: 71.55 | +[default7]: iteration 3229/ 3814 | consumed samples: 1653248 | consumed tokens: 3385851904 | elapsed time per iteration (s): 5.41 | learning rate: 1.176E-05 | global batch size: 512 | lm loss: 4.248631E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.602 | TFLOPs: 71.45 | +[default7]: iteration 3230/ 3814 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 5.42 | learning rate: 1.172E-05 | global batch size: 512 | lm loss: 4.239211E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.502 | TFLOPs: 71.38 | +[default7]: iteration 3231/ 3814 | consumed samples: 1654272 | consumed tokens: 3387949056 | elapsed time per iteration (s): 5.44 | learning rate: 1.168E-05 | global batch size: 512 | lm loss: 4.206187E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.042 | TFLOPs: 71.03 | +[default7]: iteration 3232/ 3814 | consumed samples: 1654784 | consumed tokens: 3388997632 | elapsed time per iteration (s): 5.42 | learning rate: 1.164E-05 | global batch size: 512 | lm loss: 4.246992E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.546 | TFLOPs: 71.41 | +[default7]: iteration 3233/ 3814 | consumed samples: 1655296 | consumed tokens: 3390046208 | elapsed time per iteration (s): 5.42 | learning rate: 1.160E-05 | global batch size: 512 | lm loss: 4.240926E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.462 | TFLOPs: 71.35 | +[default7]: iteration 3234/ 3814 | consumed samples: 1655808 | consumed tokens: 3391094784 | elapsed time per iteration (s): 5.43 | learning rate: 1.156E-05 | global batch size: 512 | lm loss: 4.223284E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.289 | TFLOPs: 71.22 | +[default7]: iteration 3235/ 3814 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 5.44 | learning rate: 1.152E-05 | global batch size: 512 | lm loss: 4.223007E+00 | loss scale: 262144.0 | grad norm: 0.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.204 | TFLOPs: 71.15 | +[default7]: iteration 3236/ 3814 | consumed samples: 1656832 | consumed tokens: 3393191936 | elapsed time per iteration (s): 5.43 | learning rate: 1.149E-05 | global batch size: 512 | lm loss: 4.239875E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.333 | TFLOPs: 71.25 | +[default7]: iteration 3237/ 3814 | consumed samples: 1657344 | consumed tokens: 3394240512 | elapsed time per iteration (s): 5.42 | learning rate: 1.145E-05 | global batch size: 512 | lm loss: 4.241911E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.392 | TFLOPs: 71.30 | +[default7]: iteration 3238/ 3814 | consumed samples: 1657856 | consumed tokens: 3395289088 | elapsed time per iteration (s): 5.43 | learning rate: 1.141E-05 | global batch size: 512 | lm loss: 4.247677E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.318 | TFLOPs: 71.24 | +[default7]: iteration 3239/ 3814 | consumed samples: 1658368 | consumed tokens: 3396337664 | elapsed time per iteration (s): 5.42 | learning rate: 1.137E-05 | global batch size: 512 | lm loss: 4.244621E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.391 | TFLOPs: 71.29 | +[default7]: iteration 3240/ 3814 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 5.43 | learning rate: 1.133E-05 | global batch size: 512 | lm loss: 4.244741E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.255 | TFLOPs: 71.19 | +[default7]: iteration 3241/ 3814 | consumed samples: 1659392 | consumed tokens: 3398434816 | elapsed time per iteration (s): 5.43 | learning rate: 1.129E-05 | global batch size: 512 | lm loss: 4.218042E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.308 | TFLOPs: 71.23 | +[default7]: iteration 3242/ 3814 | consumed samples: 1659904 | consumed tokens: 3399483392 | elapsed time per iteration (s): 5.44 | learning rate: 1.125E-05 | global batch size: 512 | lm loss: 4.231806E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.193 | TFLOPs: 71.15 | +[default7]: iteration 3243/ 3814 | consumed samples: 1660416 | consumed tokens: 3400531968 | elapsed time per iteration (s): 5.44 | learning rate: 1.122E-05 | global batch size: 512 | lm loss: 4.230334E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.049 | TFLOPs: 71.04 | +[default7]: iteration 3244/ 3814 | consumed samples: 1660928 | consumed tokens: 3401580544 | elapsed time per iteration (s): 5.42 | learning rate: 1.118E-05 | global batch size: 512 | lm loss: 4.243740E+00 | loss scale: 262144.0 | grad norm: 0.086 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.431 | TFLOPs: 71.33 | +[default7]: iteration 3245/ 3814 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 5.42 | learning rate: 1.114E-05 | global batch size: 512 | lm loss: 4.204631E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.398 | TFLOPs: 71.30 | +[default7]: iteration 3246/ 3814 | consumed samples: 1661952 | consumed tokens: 3403677696 | elapsed time per iteration (s): 5.44 | learning rate: 1.110E-05 | global batch size: 512 | lm loss: 4.237994E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.166 | TFLOPs: 71.12 | +[default7]: iteration 3247/ 3814 | consumed samples: 1662464 | consumed tokens: 3404726272 | elapsed time per iteration (s): 5.43 | learning rate: 1.106E-05 | global batch size: 512 | lm loss: 4.236749E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.312 | TFLOPs: 71.24 | +[default7]: iteration 3248/ 3814 | consumed samples: 1662976 | consumed tokens: 3405774848 | elapsed time per iteration (s): 5.43 | learning rate: 1.102E-05 | global batch size: 512 | lm loss: 4.239882E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.291 | TFLOPs: 71.22 | +[default7]: iteration 3249/ 3814 | consumed samples: 1663488 | consumed tokens: 3406823424 | elapsed time per iteration (s): 5.43 | learning rate: 1.099E-05 | global batch size: 512 | lm loss: 4.248704E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.357 | TFLOPs: 71.27 | +[default1]:[2023-02-16 16:15:04,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_01-model_states.pt... +[default0]:saving checkpoint at iteration 3250 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default0]:[2023-02-16 16:15:03,998] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3250 is begin to save! +[default0]:[2023-02-16 16:15:04,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_00-model_states.pt... +[default7]: iteration 3250/ 3814 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 5.43 | learning rate: 1.095E-05 | global batch size: 512 | lm loss: 4.236396E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.295 | TFLOPs: 71.22 | +[default1]:[2023-02-16 16:15:04,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_01-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_04-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,266] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_05-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_06-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_07-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_08-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_09-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_10-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_11-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_12-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,666] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_13-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,759] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_14-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:04,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_15-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_16-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:04,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,933] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:04,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:04,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:04,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_17-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:04,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_18-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_19-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_20-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:05,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_21-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_22-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:05,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_23-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,349] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_24-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:05,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_25-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,562] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_26-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_27-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:05,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_28-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_29-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_30-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:05,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_31-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:05,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_32-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:05,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:05,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:06,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:06,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_01-model_states.pt... +[default0]:[2023-02-16 16:15:05,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_33-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:05,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_00-model_states.pt... +[default1]:[2023-02-16 16:15:06,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:06,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_01-model_states.pt... +[default1]:[2023-02-16 16:15:06,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_01-model_states.pt. +[default1]:[2023-02-16 16:15:06,065] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt +[default1]:[2023-02-16 16:15:06,066] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt... +[default1]:[2023-02-16 16:15:06,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_01_model_states.pt. +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:15:06,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_34-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:06,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:06,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_35-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:06,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_00-model_states.pt... +[default0]:[2023-02-16 16:15:06,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/layer_37-model_00-model_states.pt. +[default0]:[2023-02-16 16:15:06,111] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt +[default0]:[2023-02-16 16:15:06,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt... +[default0]:[2023-02-16 16:15:06,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/mp_rank_00_model_states.pt. +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt... +[default7]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt... +[default4]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt... +[default0]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt... +[default5]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt... +[default6]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt... +[default2]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt... +[default3]:[2023-02-16 16:15:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt... +[default1]:[2023-02-16 16:15:06,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,603] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,624] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,625] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,638] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,642] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,695] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,673] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,667] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,675] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,669] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,697] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,682] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_9_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,699] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_18_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,750] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,698] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,766] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,696] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,715] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_13_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,764] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,735] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,764] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,713] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_5_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,749] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,775] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,767] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_8_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,731] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,766] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_30_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,795] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_6_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,731] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_16_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,762] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_28_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,808] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_10_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,818] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_19_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,782] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,792] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_29_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,793] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_17_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,832] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,845] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,804] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_11_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,838] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,812] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,803] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,853] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_24_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,845] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_27_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,792] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_3_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,811] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_1_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,867] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_7_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,784] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_2_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,844] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_4_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,828] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_15_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,813] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_0_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,826] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_31_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,912] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_22_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,878] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]:[2023-02-16 16:15:06,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt. +[default0]:[2023-02-16 16:15:06,903] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_00_optim_states.pt +[default0]:[2023-02-16 16:15:06,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,850] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_14_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default6]:[2023-02-16 16:15:06,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt. +[default6]:[2023-02-16 16:15:06,920] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_00_optim_states.pt +[default6]:[2023-02-16 16:15:06,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default3]:[2023-02-16 16:15:06,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt. +[default3]:[2023-02-16 16:15:06,894] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_01_optim_states.pt +[default3]:[2023-02-16 16:15:06,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default7]:[2023-02-16 16:15:06,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt. +[default7]:[2023-02-16 16:15:06,921] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_23_mp_rank_01_optim_states.pt +[default7]:[2023-02-16 16:15:06,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,927] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_25_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default5]:[2023-02-16 16:15:06,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt. +[default5]:[2023-02-16 16:15:06,950] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_01_optim_states.pt +[default5]:[2023-02-16 16:15:06,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,868] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_12_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default2]:[2023-02-16 16:15:06,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt. +[default2]:[2023-02-16 16:15:06,949] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_21_mp_rank_00_optim_states.pt +[default2]:[2023-02-16 16:15:06,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default1]:[2023-02-16 16:15:06,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt. +[default1]:[2023-02-16 16:15:06,971] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_20_mp_rank_01_optim_states.pt +[default1]:[2023-02-16 16:15:06,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default4]:[2023-02-16 16:15:06,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt. +[default4]:[2023-02-16 16:15:06,969] [INFO] [engine.py:3199:_save_zero_checkpoint] zero checkpoint saved /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main/global_step3250/zero_pp_rank_26_mp_rank_00_optim_states.pt +[default4]:[2023-02-16 16:15:06,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3250 is ready now! +[default0]: successfully saved checkpoint at iteration 3250 to /gpfsscratch/rech/ajs/commun/checkpoints/tr14-2B7-lr0.0017-init0.1-inpm10-outm10-atnm10-mup/checkpoints/main +[default7]:time (ms) | save-checkpoint: 2975.48 +[default7]: iteration 3251/ 3814 | consumed samples: 1664512 | consumed tokens: 3408920576 | elapsed time per iteration (s): 8.43 | learning rate: 1.091E-05 | global batch size: 512 | lm loss: 4.201881E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 60.771 | TFLOPs: 45.90 | +[default7]: iteration 3252/ 3814 | consumed samples: 1665024 | consumed tokens: 3409969152 | elapsed time per iteration (s): 5.45 | learning rate: 1.087E-05 | global batch size: 512 | lm loss: 4.233357E+00 | loss scale: 262144.0 | grad norm: 0.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.917 | TFLOPs: 70.94 | +[default7]: iteration 3253/ 3814 | consumed samples: 1665536 | consumed tokens: 3411017728 | elapsed time per iteration (s): 5.45 | learning rate: 1.083E-05 | global batch size: 512 | lm loss: 4.232371E+00 | loss scale: 262144.0 | grad norm: 0.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.969 | TFLOPs: 70.98 | +[default7]: iteration 3254/ 3814 | consumed samples: 1666048 | consumed tokens: 3412066304 | elapsed time per iteration (s): 5.45 | learning rate: 1.080E-05 | global batch size: 512 | lm loss: 4.241189E+00 | loss scale: 262144.0 | grad norm: 0.097 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.887 | TFLOPs: 70.91 | +[default7]: iteration 3255/ 3814 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 5.44 | learning rate: 1.076E-05 | global batch size: 512 | lm loss: 4.239332E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.168 | TFLOPs: 71.13 | +[default7]: iteration 3256/ 3814 | consumed samples: 1667072 | consumed tokens: 3414163456 | elapsed time per iteration (s): 5.45 | learning rate: 1.072E-05 | global batch size: 512 | lm loss: 4.261608E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.020 | TFLOPs: 71.01 | +[default7]: iteration 3257/ 3814 | consumed samples: 1667584 | consumed tokens: 3415212032 | elapsed time per iteration (s): 5.42 | learning rate: 1.068E-05 | global batch size: 512 | lm loss: 4.234900E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.524 | TFLOPs: 71.40 | +[default7]: iteration 3258/ 3814 | consumed samples: 1668096 | consumed tokens: 3416260608 | elapsed time per iteration (s): 5.42 | learning rate: 1.065E-05 | global batch size: 512 | lm loss: 4.216873E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.517 | TFLOPs: 71.39 | +[default7]: iteration 3259/ 3814 | consumed samples: 1668608 | consumed tokens: 3417309184 | elapsed time per iteration (s): 5.43 | learning rate: 1.061E-05 | global batch size: 512 | lm loss: 4.232929E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.350 | TFLOPs: 71.26 | +[default7]: iteration 3260/ 3814 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 5.42 | learning rate: 1.057E-05 | global batch size: 512 | lm loss: 4.230484E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.506 | TFLOPs: 71.38 | +[default7]: iteration 3261/ 3814 | consumed samples: 1669632 | consumed tokens: 3419406336 | elapsed time per iteration (s): 5.42 | learning rate: 1.054E-05 | global batch size: 512 | lm loss: 4.220443E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.384 | TFLOPs: 71.29 | +[default7]: iteration 3262/ 3814 | consumed samples: 1670144 | consumed tokens: 3420454912 | elapsed time per iteration (s): 5.42 | learning rate: 1.050E-05 | global batch size: 512 | lm loss: 4.266650E+00 | loss scale: 262144.0 | grad norm: 0.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.445 | TFLOPs: 71.33 | +[default7]: iteration 3263/ 3814 | consumed samples: 1670656 | consumed tokens: 3421503488 | elapsed time per iteration (s): 5.42 | learning rate: 1.046E-05 | global batch size: 512 | lm loss: 4.233566E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.526 | TFLOPs: 71.40 | +[default7]: iteration 3264/ 3814 | consumed samples: 1671168 | consumed tokens: 3422552064 | elapsed time per iteration (s): 5.42 | learning rate: 1.042E-05 | global batch size: 512 | lm loss: 4.235141E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.538 | TFLOPs: 71.41 | +[default7]: iteration 3265/ 3814 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 5.43 | learning rate: 1.039E-05 | global batch size: 512 | lm loss: 4.240074E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.264 | TFLOPs: 71.20 | +[default7]: iteration 3266/ 3814 | consumed samples: 1672192 | consumed tokens: 3424649216 | elapsed time per iteration (s): 5.44 | learning rate: 1.035E-05 | global batch size: 512 | lm loss: 4.218791E+00 | loss scale: 262144.0 | grad norm: 0.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.159 | TFLOPs: 71.12 | +[default7]: iteration 3267/ 3814 | consumed samples: 1672704 | consumed tokens: 3425697792 | elapsed time per iteration (s): 5.43 | learning rate: 1.031E-05 | global batch size: 512 | lm loss: 4.217254E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.250 | TFLOPs: 71.19 | +[default7]: iteration 3268/ 3814 | consumed samples: 1673216 | consumed tokens: 3426746368 | elapsed time per iteration (s): 5.44 | learning rate: 1.028E-05 | global batch size: 512 | lm loss: 4.254647E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.132 | TFLOPs: 71.10 | +[default7]: iteration 3269/ 3814 | consumed samples: 1673728 | consumed tokens: 3427794944 | elapsed time per iteration (s): 5.43 | learning rate: 1.024E-05 | global batch size: 512 | lm loss: 4.263520E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.286 | TFLOPs: 71.22 | +[default7]: iteration 3270/ 3814 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 5.44 | learning rate: 1.020E-05 | global batch size: 512 | lm loss: 4.188801E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.195 | TFLOPs: 71.15 | +[default7]: iteration 3271/ 3814 | consumed samples: 1674752 | consumed tokens: 3429892096 | elapsed time per iteration (s): 5.44 | learning rate: 1.017E-05 | global batch size: 512 | lm loss: 4.217118E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.129 | TFLOPs: 71.10 | +[default7]: iteration 3272/ 3814 | consumed samples: 1675264 | consumed tokens: 3430940672 | elapsed time per iteration (s): 5.43 | learning rate: 1.013E-05 | global batch size: 512 | lm loss: 4.217278E+00 | loss scale: 262144.0 | grad norm: 0.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.216 | TFLOPs: 71.16 | +[default7]: iteration 3273/ 3814 | consumed samples: 1675776 | consumed tokens: 3431989248 | elapsed time per iteration (s): 5.44 | learning rate: 1.009E-05 | global batch size: 512 | lm loss: 4.228255E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.068 | TFLOPs: 71.05 | +[default7]: iteration 3274/ 3814 | consumed samples: 1676288 | consumed tokens: 3433037824 | elapsed time per iteration (s): 5.42 | learning rate: 1.006E-05 | global batch size: 512 | lm loss: 4.226753E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.487 | TFLOPs: 71.37 | +[default7]: iteration 3275/ 3814 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 5.43 | learning rate: 1.002E-05 | global batch size: 512 | lm loss: 4.234141E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.372 | TFLOPs: 71.28 | +[default7]: iteration 3276/ 3814 | consumed samples: 1677312 | consumed tokens: 3435134976 | elapsed time per iteration (s): 5.43 | learning rate: 9.983E-06 | global batch size: 512 | lm loss: 4.244141E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.226 | TFLOPs: 71.17 | +[default7]: iteration 3277/ 3814 | consumed samples: 1677824 | consumed tokens: 3436183552 | elapsed time per iteration (s): 5.44 | learning rate: 9.947E-06 | global batch size: 512 | lm loss: 4.255011E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.094 | TFLOPs: 71.07 | +[default7]: iteration 3278/ 3814 | consumed samples: 1678336 | consumed tokens: 3437232128 | elapsed time per iteration (s): 5.45 | learning rate: 9.911E-06 | global batch size: 512 | lm loss: 4.236968E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.022 | TFLOPs: 71.02 | +[default7]: iteration 3279/ 3814 | consumed samples: 1678848 | consumed tokens: 3438280704 | elapsed time per iteration (s): 5.45 | learning rate: 9.875E-06 | global batch size: 512 | lm loss: 4.254960E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.941 | TFLOPs: 70.95 | +[default7]: iteration 3280/ 3814 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 5.43 | learning rate: 9.838E-06 | global batch size: 512 | lm loss: 4.255818E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.211 | TFLOPs: 71.16 | +[default7]: iteration 3281/ 3814 | consumed samples: 1679872 | consumed tokens: 3440377856 | elapsed time per iteration (s): 5.45 | learning rate: 9.802E-06 | global batch size: 512 | lm loss: 4.195121E+00 | loss scale: 262144.0 | grad norm: 0.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.016 | TFLOPs: 71.01 | +[default7]: iteration 3282/ 3814 | consumed samples: 1680384 | consumed tokens: 3441426432 | elapsed time per iteration (s): 5.43 | learning rate: 9.766E-06 | global batch size: 512 | lm loss: 4.225697E+00 | loss scale: 262144.0 | grad norm: 0.095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.308 | TFLOPs: 71.23 | +[default7]: iteration 3283/ 3814 | consumed samples: 1680896 | consumed tokens: 3442475008 | elapsed time per iteration (s): 5.46 | learning rate: 9.731E-06 | global batch size: 512 | lm loss: 4.215566E+00 | loss scale: 262144.0 | grad norm: 0.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.784 | TFLOPs: 70.84 | +[default7]: iteration 3284/ 3814 | consumed samples: 1681408 | consumed tokens: 3443523584 | elapsed time per iteration (s): 5.45 | learning rate: 9.695E-06 | global batch size: 512 | lm loss: 4.237579E+00 | loss scale: 262144.0 | grad norm: 0.096 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.897 | TFLOPs: 70.92 | +[default7]: iteration 3285/ 3814 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 5.45 | learning rate: 9.659E-06 | global batch size: 512 | lm loss: 4.223916E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.902 | TFLOPs: 70.93 | +[default7]: iteration 3286/ 3814 | consumed samples: 1682432 | consumed tokens: 3445620736 | elapsed time per iteration (s): 5.44 | learning rate: 9.623E-06 | global batch size: 512 | lm loss: 4.272243E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.124 | TFLOPs: 71.09 | +[default7]: iteration 3287/ 3814 | consumed samples: 1682944 | consumed tokens: 3446669312 | elapsed time per iteration (s): 5.46 | learning rate: 9.587E-06 | global batch size: 512 | lm loss: 4.213273E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.760 | TFLOPs: 70.82 | +[default7]: iteration 3288/ 3814 | consumed samples: 1683456 | consumed tokens: 3447717888 | elapsed time per iteration (s): 5.45 | learning rate: 9.552E-06 | global batch size: 512 | lm loss: 4.193983E+00 | loss scale: 262144.0 | grad norm: 0.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.971 | TFLOPs: 70.98 | +[default7]: iteration 3289/ 3814 | consumed samples: 1683968 | consumed tokens: 3448766464 | elapsed time per iteration (s): 5.46 | learning rate: 9.516E-06 | global batch size: 512 | lm loss: 4.246660E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.735 | TFLOPs: 70.80 | +[default7]: iteration 3290/ 3814 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 5.45 | learning rate: 9.481E-06 | global batch size: 512 | lm loss: 4.207823E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.026 | TFLOPs: 71.02 | +[default7]: iteration 3291/ 3814 | consumed samples: 1684992 | consumed tokens: 3450863616 | elapsed time per iteration (s): 5.45 | learning rate: 9.445E-06 | global batch size: 512 | lm loss: 4.239370E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.979 | TFLOPs: 70.98 | +[default7]: iteration 3292/ 3814 | consumed samples: 1685504 | consumed tokens: 3451912192 | elapsed time per iteration (s): 5.45 | learning rate: 9.410E-06 | global batch size: 512 | lm loss: 4.238070E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 93.943 | TFLOPs: 70.96 | +[default7]: iteration 3293/ 3814 | consumed samples: 1686016 | consumed tokens: 3452960768 | elapsed time per iteration (s): 5.43 | learning rate: 9.375E-06 | global batch size: 512 | lm loss: 4.230848E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.332 | TFLOPs: 71.25 | +[default7]: iteration 3294/ 3814 | consumed samples: 1686528 | consumed tokens: 3454009344 | elapsed time per iteration (s): 5.43 | learning rate: 9.340E-06 | global batch size: 512 | lm loss: 4.228353E+00 | loss scale: 262144.0 | grad norm: 0.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.348 | TFLOPs: 71.26 | +[default7]: iteration 3295/ 3814 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 5.43 | learning rate: 9.304E-06 | global batch size: 512 | lm loss: 4.232199E+00 | loss scale: 262144.0 | grad norm: 0.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.268 | TFLOPs: 71.20 | +[default7]: iteration 3296/ 3814 | consumed samples: 1687552 | consumed tokens: 3456106496 | elapsed time per iteration (s): 5.43 | learning rate: 9.269E-06 | global batch size: 512 | lm loss: 4.251224E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.335 | TFLOPs: 71.25 | +[default7]: iteration 3297/ 3814 | consumed samples: 1688064 | consumed tokens: 3457155072 | elapsed time per iteration (s): 5.43 | learning rate: 9.234E-06 | global batch size: 512 | lm loss: 4.231680E+00 | loss scale: 262144.0 | grad norm: 0.092 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.325 | TFLOPs: 71.24 | +[default7]: iteration 3298/ 3814 | consumed samples: 1688576 | consumed tokens: 3458203648 | elapsed time per iteration (s): 5.43 | learning rate: 9.199E-06 | global batch size: 512 | lm loss: 4.241086E+00 | loss scale: 262144.0 | grad norm: 0.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.310 | TFLOPs: 71.23 | +[default7]: iteration 3299/ 3814 | consumed samples: 1689088 | consumed tokens: 3459252224 | elapsed time per iteration (s): 5.43 | learning rate: 9.164E-06 | global batch size: 512 | lm loss: 4.229588E+00 | loss scale: 262144.0 | grad norm: 0.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 94.337 | TFLOPs: 71.25 | +slurmstepd: error: *** STEP 1219022.0 ON jean-zay-iam09 CANCELLED AT 2023-02-16T16:19:36 DUE TO TIME LIMIT *** +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954560 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591365 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111818 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454531 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145112 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938150 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591366 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856569 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111819 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454532 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591367 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938151 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856570 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954561 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938152 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111820 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111821 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954562 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145113 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454533 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856571 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591368 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454534 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454535 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856572 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938153 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145114 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954563 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111822 closing signal SIGTERM +WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591369 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856573 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938154 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954564 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112502 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145115 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111823 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938155 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454536 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591370 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954565 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112503 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145116 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938156 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454537 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591371 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111824 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112504 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145117 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938157 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454538 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954566 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591372 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112505 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145118 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111825 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145119 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112506 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112507 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954567 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112508 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112509 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856574 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856575 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856576 closing signal SIGTERM +srun: Job step aborted: Waiting up to 62 seconds for job step to finish. +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954560 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954561 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954562 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954563 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954564 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954565 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954566 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1954567 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112502 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112503 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112504 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112505 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112506 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112507 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112508 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 112509 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145112 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145113 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145114 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145115 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145116 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145117 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145118 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 145119 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856569 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856570 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856571 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856572 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856573 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856574 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856575 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1856576 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591365 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591366 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591367 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591368 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591369 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591370 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591371 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1591372 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454531 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454532 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454533 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454534 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454535 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454536 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454537 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1454538 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111818 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111819 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111820 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111821 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111822 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111823 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111824 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 111825 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938150 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938151 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938152 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938153 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938154 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938155 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938156 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3938157 closing signal SIGTERM