{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 461,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021691973969631237,
      "grad_norm": 11.479755487865775,
      "learning_rate": 1.0638297872340425e-08,
      "loss": 1.204,
      "step": 1
    },
    {
      "epoch": 0.010845986984815618,
      "grad_norm": 11.719854924785,
      "learning_rate": 5.3191489361702123e-08,
      "loss": 1.2705,
      "step": 5
    },
    {
      "epoch": 0.021691973969631236,
      "grad_norm": 12.5013105505897,
      "learning_rate": 1.0638297872340425e-07,
      "loss": 1.2569,
      "step": 10
    },
    {
      "epoch": 0.03253796095444685,
      "grad_norm": 10.664645875250606,
      "learning_rate": 1.5957446808510638e-07,
      "loss": 1.2504,
      "step": 15
    },
    {
      "epoch": 0.04338394793926247,
      "grad_norm": 9.933034533041308,
      "learning_rate": 2.127659574468085e-07,
      "loss": 1.2132,
      "step": 20
    },
    {
      "epoch": 0.05422993492407809,
      "grad_norm": 5.664415659701916,
      "learning_rate": 2.659574468085106e-07,
      "loss": 1.1718,
      "step": 25
    },
    {
      "epoch": 0.0650759219088937,
      "grad_norm": 4.799955074967526,
      "learning_rate": 3.1914893617021275e-07,
      "loss": 1.1492,
      "step": 30
    },
    {
      "epoch": 0.07592190889370933,
      "grad_norm": 3.5056771066313868,
      "learning_rate": 3.7234042553191484e-07,
      "loss": 1.1193,
      "step": 35
    },
    {
      "epoch": 0.08676789587852494,
      "grad_norm": 2.758357067802773,
      "learning_rate": 4.25531914893617e-07,
      "loss": 1.0995,
      "step": 40
    },
    {
      "epoch": 0.09761388286334056,
      "grad_norm": 2.512596535973654,
      "learning_rate": 4.787234042553192e-07,
      "loss": 1.0789,
      "step": 45
    },
    {
      "epoch": 0.10845986984815618,
      "grad_norm": 2.4219547091489244,
      "learning_rate": 4.999352212103373e-07,
      "loss": 1.0741,
      "step": 50
    },
    {
      "epoch": 0.1193058568329718,
      "grad_norm": 2.359757061351952,
      "learning_rate": 4.995394723941067e-07,
      "loss": 1.0585,
      "step": 55
    },
    {
      "epoch": 0.1301518438177874,
      "grad_norm": 2.4328040984593247,
      "learning_rate": 4.98784531935359e-07,
      "loss": 1.0568,
      "step": 60
    },
    {
      "epoch": 0.14099783080260303,
      "grad_norm": 2.3052389483909543,
      "learning_rate": 4.976714865090826e-07,
      "loss": 1.0638,
      "step": 65
    },
    {
      "epoch": 0.15184381778741865,
      "grad_norm": 2.198699363060523,
      "learning_rate": 4.96201938253052e-07,
      "loss": 1.0349,
      "step": 70
    },
    {
      "epoch": 0.16268980477223427,
      "grad_norm": 2.138037840029951,
      "learning_rate": 4.943780024616802e-07,
      "loss": 1.034,
      "step": 75
    },
    {
      "epoch": 0.1735357917570499,
      "grad_norm": 2.1989759782977076,
      "learning_rate": 4.922023045412265e-07,
      "loss": 1.0295,
      "step": 80
    },
    {
      "epoch": 0.1843817787418655,
      "grad_norm": 2.2941860079446235,
      "learning_rate": 4.896779762307389e-07,
      "loss": 1.0262,
      "step": 85
    },
    {
      "epoch": 0.19522776572668113,
      "grad_norm": 2.1481967771932773,
      "learning_rate": 4.868086510941716e-07,
      "loss": 1.0154,
      "step": 90
    },
    {
      "epoch": 0.20607375271149675,
      "grad_norm": 2.160671871794234,
      "learning_rate": 4.835984592901677e-07,
      "loss": 1.0468,
      "step": 95
    },
    {
      "epoch": 0.21691973969631237,
      "grad_norm": 2.1550419124695783,
      "learning_rate": 4.800520216270341e-07,
      "loss": 1.0336,
      "step": 100
    },
    {
      "epoch": 0.227765726681128,
      "grad_norm": 2.2253199590666157,
      "learning_rate": 4.7617444291146555e-07,
      "loss": 1.0224,
      "step": 105
    },
    {
      "epoch": 0.2386117136659436,
      "grad_norm": 2.183801883640947,
      "learning_rate": 4.7197130460059377e-07,
      "loss": 1.0139,
      "step": 110
    },
    {
      "epoch": 0.24945770065075923,
      "grad_norm": 2.177485956300299,
      "learning_rate": 4.6744865676793666e-07,
      "loss": 1.0053,
      "step": 115
    },
    {
      "epoch": 0.2603036876355748,
      "grad_norm": 2.1256298478006856,
      "learning_rate": 4.6261300939481274e-07,
      "loss": 0.9914,
      "step": 120
    },
    {
      "epoch": 0.27114967462039047,
      "grad_norm": 2.1075218318516886,
      "learning_rate": 4.574713229997563e-07,
      "loss": 1.0006,
      "step": 125
    },
    {
      "epoch": 0.28199566160520606,
      "grad_norm": 2.1304778495096457,
      "learning_rate": 4.520309986194201e-07,
      "loss": 1.0338,
      "step": 130
    },
    {
      "epoch": 0.2928416485900217,
      "grad_norm": 2.1650488920006046,
      "learning_rate": 4.462998671553897e-07,
      "loss": 1.0017,
      "step": 135
    },
    {
      "epoch": 0.3036876355748373,
      "grad_norm": 2.214789820606799,
      "learning_rate": 4.4028617810224115e-07,
      "loss": 1.0101,
      "step": 140
    },
    {
      "epoch": 0.31453362255965295,
      "grad_norm": 2.157917041478876,
      "learning_rate": 4.3399858767306927e-07,
      "loss": 0.989,
      "step": 145
    },
    {
      "epoch": 0.32537960954446854,
      "grad_norm": 2.092011368725107,
      "learning_rate": 4.2744614633957723e-07,
      "loss": 1.0115,
      "step": 150
    },
    {
      "epoch": 0.3362255965292842,
      "grad_norm": 2.219786897391391,
      "learning_rate": 4.206382858046635e-07,
      "loss": 0.9853,
      "step": 155
    },
    {
      "epoch": 0.3470715835140998,
      "grad_norm": 2.0979071528204756,
      "learning_rate": 4.135848054262578e-07,
      "loss": 1.0163,
      "step": 160
    },
    {
      "epoch": 0.3579175704989154,
      "grad_norm": 2.1698498942112288,
      "learning_rate": 4.062958581119472e-07,
      "loss": 0.9865,
      "step": 165
    },
    {
      "epoch": 0.368763557483731,
      "grad_norm": 2.132812386449239,
      "learning_rate": 3.9878193570469743e-07,
      "loss": 0.9871,
      "step": 170
    },
    {
      "epoch": 0.3796095444685466,
      "grad_norm": 2.337071820270912,
      "learning_rate": 3.91053853880703e-07,
      "loss": 0.984,
      "step": 175
    },
    {
      "epoch": 0.39045553145336226,
      "grad_norm": 2.1129666341045263,
      "learning_rate": 3.831227365811074e-07,
      "loss": 0.9943,
      "step": 180
    },
    {
      "epoch": 0.40130151843817785,
      "grad_norm": 2.121223339662348,
      "learning_rate": 3.75e-07,
      "loss": 0.9918,
      "step": 185
    },
    {
      "epoch": 0.4121475054229935,
      "grad_norm": 2.285212865590998,
      "learning_rate": 3.6669733615173965e-07,
      "loss": 0.9843,
      "step": 190
    },
    {
      "epoch": 0.4229934924078091,
      "grad_norm": 2.123810421477716,
      "learning_rate": 3.5822669604125684e-07,
      "loss": 0.9781,
      "step": 195
    },
    {
      "epoch": 0.43383947939262474,
      "grad_norm": 2.1627048249065997,
      "learning_rate": 3.4960027246156036e-07,
      "loss": 0.9781,
      "step": 200
    },
    {
      "epoch": 0.44468546637744033,
      "grad_norm": 2.137516375815561,
      "learning_rate": 3.408304824432103e-07,
      "loss": 0.9867,
      "step": 205
    },
    {
      "epoch": 0.455531453362256,
      "grad_norm": 2.058882247220635,
      "learning_rate": 3.319299493810187e-07,
      "loss": 0.9776,
      "step": 210
    },
    {
      "epoch": 0.46637744034707157,
      "grad_norm": 2.1433762495027864,
      "learning_rate": 3.229114848637062e-07,
      "loss": 0.9879,
      "step": 215
    },
    {
      "epoch": 0.4772234273318872,
      "grad_norm": 2.1877299182133734,
      "learning_rate": 3.13788070232669e-07,
      "loss": 0.9812,
      "step": 220
    },
    {
      "epoch": 0.4880694143167028,
      "grad_norm": 2.0613909390774525,
      "learning_rate": 3.0457283789640036e-07,
      "loss": 1.0005,
      "step": 225
    },
    {
      "epoch": 0.49891540130151846,
      "grad_norm": 2.137287960677187,
      "learning_rate": 2.9527905242746395e-07,
      "loss": 0.9994,
      "step": 230
    },
    {
      "epoch": 0.5097613882863341,
      "grad_norm": 2.062311669373047,
      "learning_rate": 2.85920091469227e-07,
      "loss": 0.9806,
      "step": 235
    },
    {
      "epoch": 0.5206073752711496,
      "grad_norm": 2.1009325069210054,
      "learning_rate": 2.765094264798387e-07,
      "loss": 0.9905,
      "step": 240
    },
    {
      "epoch": 0.5314533622559653,
      "grad_norm": 2.252996820494139,
      "learning_rate": 2.6706060334116775e-07,
      "loss": 0.9833,
      "step": 245
    },
    {
      "epoch": 0.5422993492407809,
      "grad_norm": 2.063111058570344,
      "learning_rate": 2.575872228606156e-07,
      "loss": 0.9466,
      "step": 250
    },
    {
      "epoch": 0.5531453362255966,
      "grad_norm": 2.1028294210091794,
      "learning_rate": 2.4810292119386674e-07,
      "loss": 0.9556,
      "step": 255
    },
    {
      "epoch": 0.5639913232104121,
      "grad_norm": 2.121237281068362,
      "learning_rate": 2.3862135021675915e-07,
      "loss": 0.9717,
      "step": 260
    },
    {
      "epoch": 0.5748373101952278,
      "grad_norm": 2.2205226946736905,
      "learning_rate": 2.2915615787452664e-07,
      "loss": 0.9638,
      "step": 265
    },
    {
      "epoch": 0.5856832971800434,
      "grad_norm": 2.0940361008010333,
      "learning_rate": 2.1972096853669903e-07,
      "loss": 0.9671,
      "step": 270
    },
    {
      "epoch": 0.596529284164859,
      "grad_norm": 2.0586787374748576,
      "learning_rate": 2.1032936338593717e-07,
      "loss": 0.9773,
      "step": 275
    },
    {
      "epoch": 0.6073752711496746,
      "grad_norm": 2.072790057342988,
      "learning_rate": 2.0099486086903294e-07,
      "loss": 0.9684,
      "step": 280
    },
    {
      "epoch": 0.6182212581344902,
      "grad_norm": 2.1778824831003147,
      "learning_rate": 1.9173089723821087e-07,
      "loss": 0.9667,
      "step": 285
    },
    {
      "epoch": 0.6290672451193059,
      "grad_norm": 2.115203742538656,
      "learning_rate": 1.825508072107439e-07,
      "loss": 0.9919,
      "step": 290
    },
    {
      "epoch": 0.6399132321041214,
      "grad_norm": 2.9734190506035536,
      "learning_rate": 1.7346780477471897e-07,
      "loss": 0.9741,
      "step": 295
    },
    {
      "epoch": 0.6507592190889371,
      "grad_norm": 2.109405285224148,
      "learning_rate": 1.6449496416858282e-07,
      "loss": 0.9815,
      "step": 300
    },
    {
      "epoch": 0.6616052060737527,
      "grad_norm": 2.0612586919012355,
      "learning_rate": 1.5564520106184643e-07,
      "loss": 0.9637,
      "step": 305
    },
    {
      "epoch": 0.6724511930585684,
      "grad_norm": 2.126282772097656,
      "learning_rate": 1.4693125396403562e-07,
      "loss": 0.9576,
      "step": 310
    },
    {
      "epoch": 0.6832971800433839,
      "grad_norm": 2.151977422363772,
      "learning_rate": 1.3836566588865e-07,
      "loss": 0.9871,
      "step": 315
    },
    {
      "epoch": 0.6941431670281996,
      "grad_norm": 2.10699457923422,
      "learning_rate": 1.2996076629852112e-07,
      "loss": 0.9775,
      "step": 320
    },
    {
      "epoch": 0.7049891540130152,
      "grad_norm": 2.2306202203282366,
      "learning_rate": 1.2172865335856064e-07,
      "loss": 0.9869,
      "step": 325
    },
    {
      "epoch": 0.7158351409978309,
      "grad_norm": 2.1309989319293696,
      "learning_rate": 1.1368117652144185e-07,
      "loss": 0.9871,
      "step": 330
    },
    {
      "epoch": 0.7266811279826464,
      "grad_norm": 2.1871677586329175,
      "learning_rate": 1.0582991947128323e-07,
      "loss": 0.983,
      "step": 335
    },
    {
      "epoch": 0.737527114967462,
      "grad_norm": 2.089837928751529,
      "learning_rate": 9.818618344988258e-08,
      "loss": 0.9817,
      "step": 340
    },
    {
      "epoch": 0.7483731019522777,
      "grad_norm": 2.0529724920050905,
      "learning_rate": 9.076097098950541e-08,
      "loss": 0.9581,
      "step": 345
    },
    {
      "epoch": 0.7592190889370932,
      "grad_norm": 2.077217246150324,
      "learning_rate": 8.356497007563986e-08,
      "loss": 0.9845,
      "step": 350
    },
    {
      "epoch": 0.7700650759219089,
      "grad_norm": 2.1176827367409876,
      "learning_rate": 7.660853876251683e-08,
      "loss": 0.9625,
      "step": 355
    },
    {
      "epoch": 0.7809110629067245,
      "grad_norm": 2.1409566690457433,
      "learning_rate": 6.990169026353867e-08,
      "loss": 0.9999,
      "step": 360
    },
    {
      "epoch": 0.7917570498915402,
      "grad_norm": 2.0254717410055703,
      "learning_rate": 6.345407853807863e-08,
      "loss": 0.9642,
      "step": 365
    },
    {
      "epoch": 0.8026030368763557,
      "grad_norm": 2.140899328755212,
      "learning_rate": 5.727498439539602e-08,
      "loss": 0.9717,
      "step": 370
    },
    {
      "epoch": 0.8134490238611713,
      "grad_norm": 2.0666287819389404,
      "learning_rate": 5.13733021356714e-08,
      "loss": 0.9697,
      "step": 375
    },
    {
      "epoch": 0.824295010845987,
      "grad_norm": 2.0741290971837127,
      "learning_rate": 4.57575267473895e-08,
      "loss": 0.9641,
      "step": 380
    },
    {
      "epoch": 0.8351409978308026,
      "grad_norm": 2.108990519804528,
      "learning_rate": 4.043574167949892e-08,
      "loss": 0.9902,
      "step": 385
    },
    {
      "epoch": 0.8459869848156182,
      "grad_norm": 2.094620559269544,
      "learning_rate": 3.541560720594869e-08,
      "loss": 0.9926,
      "step": 390
    },
    {
      "epoch": 0.8568329718004338,
      "grad_norm": 2.0594257251140586,
      "learning_rate": 3.0704349399351435e-08,
      "loss": 0.9614,
      "step": 395
    },
    {
      "epoch": 0.8676789587852495,
      "grad_norm": 2.1291510379356753,
      "learning_rate": 2.6308749729643058e-08,
      "loss": 0.9651,
      "step": 400
    },
    {
      "epoch": 0.8785249457700651,
      "grad_norm": 2.1035620640567734,
      "learning_rate": 2.2235135302712092e-08,
      "loss": 0.9952,
      "step": 405
    },
    {
      "epoch": 0.8893709327548807,
      "grad_norm": 2.0424482935650614,
      "learning_rate": 1.8489369753048682e-08,
      "loss": 0.9631,
      "step": 410
    },
    {
      "epoch": 0.9002169197396963,
      "grad_norm": 2.0939056533766847,
      "learning_rate": 1.507684480352292e-08,
      "loss": 0.9747,
      "step": 415
    },
    {
      "epoch": 0.911062906724512,
      "grad_norm": 4.429046210547233,
      "learning_rate": 1.2002472504440807e-08,
      "loss": 0.9843,
      "step": 420
    },
    {
      "epoch": 0.9219088937093276,
      "grad_norm": 2.1047314550346297,
      "learning_rate": 9.270678163050217e-09,
      "loss": 0.9738,
      "step": 425
    },
    {
      "epoch": 0.9327548806941431,
      "grad_norm": 2.0998363541587888,
      "learning_rate": 6.885393973673298e-09,
      "loss": 0.9509,
      "step": 430
    },
    {
      "epoch": 0.9436008676789588,
      "grad_norm": 2.138035512894866,
      "learning_rate": 4.850053357634693e-09,
      "loss": 0.9799,
      "step": 435
    },
    {
      "epoch": 0.9544468546637744,
      "grad_norm": 2.053994471901985,
      "learning_rate": 3.1675860211325954e-09,
      "loss": 0.9762,
      "step": 440
    },
    {
      "epoch": 0.96529284164859,
      "grad_norm": 2.054984790757136,
      "learning_rate": 1.840413738166402e-09,
      "loss": 0.9499,
      "step": 445
    },
    {
      "epoch": 0.9761388286334056,
      "grad_norm": 2.0710147404144097,
      "learning_rate": 8.704468645914787e-10,
      "loss": 0.9692,
      "step": 450
    },
    {
      "epoch": 0.9869848156182213,
      "grad_norm": 2.067619347023652,
      "learning_rate": 2.5908158831811077e-10,
      "loss": 0.9994,
      "step": 455
    },
    {
      "epoch": 0.9978308026030369,
      "grad_norm": 2.095822455094207,
      "learning_rate": 7.197919613455284e-12,
      "loss": 0.9524,
      "step": 460
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.0288481712341309,
      "eval_runtime": 3.4782,
      "eval_samples_per_second": 74.465,
      "eval_steps_per_second": 1.438,
      "step": 461
    },
    {
      "epoch": 1.0,
      "step": 461,
      "total_flos": 192943352709120.0,
      "train_loss": 1.0081440616326323,
      "train_runtime": 5423.7345,
      "train_samples_per_second": 21.752,
      "train_steps_per_second": 0.085
    }
  ],
  "logging_steps": 5,
  "max_steps": 461,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 192943352709120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}