{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1001,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000999000999000999,
      "grad_norm": 1.0703125,
      "learning_rate": 1.9801980198019803e-06,
      "loss": 1.9138,
      "step": 1
    },
    {
      "epoch": 0.004995004995004995,
      "grad_norm": 1.0546875,
      "learning_rate": 9.900990099009901e-06,
      "loss": 1.95,
      "step": 5
    },
    {
      "epoch": 0.00999000999000999,
      "grad_norm": 0.80078125,
      "learning_rate": 1.9801980198019803e-05,
      "loss": 1.9142,
      "step": 10
    },
    {
      "epoch": 0.014985014985014986,
      "grad_norm": 0.671875,
      "learning_rate": 2.9702970297029702e-05,
      "loss": 1.8437,
      "step": 15
    },
    {
      "epoch": 0.01998001998001998,
      "grad_norm": 0.83203125,
      "learning_rate": 3.9603960396039605e-05,
      "loss": 1.7806,
      "step": 20
    },
    {
      "epoch": 0.024975024975024976,
      "grad_norm": 0.439453125,
      "learning_rate": 4.950495049504951e-05,
      "loss": 1.645,
      "step": 25
    },
    {
      "epoch": 0.029970029970029972,
      "grad_norm": 0.33203125,
      "learning_rate": 5.9405940594059404e-05,
      "loss": 1.5936,
      "step": 30
    },
    {
      "epoch": 0.03496503496503497,
      "grad_norm": 0.345703125,
      "learning_rate": 6.93069306930693e-05,
      "loss": 1.5451,
      "step": 35
    },
    {
      "epoch": 0.03996003996003996,
      "grad_norm": 0.2138671875,
      "learning_rate": 7.920792079207921e-05,
      "loss": 1.5002,
      "step": 40
    },
    {
      "epoch": 0.04495504495504495,
      "grad_norm": 0.15625,
      "learning_rate": 8.910891089108912e-05,
      "loss": 1.4846,
      "step": 45
    },
    {
      "epoch": 0.04995004995004995,
      "grad_norm": 0.16015625,
      "learning_rate": 9.900990099009902e-05,
      "loss": 1.4525,
      "step": 50
    },
    {
      "epoch": 0.054945054945054944,
      "grad_norm": 0.1328125,
      "learning_rate": 0.00010891089108910893,
      "loss": 1.4746,
      "step": 55
    },
    {
      "epoch": 0.059940059940059943,
      "grad_norm": 0.177734375,
      "learning_rate": 0.00011881188118811881,
      "loss": 1.439,
      "step": 60
    },
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.00012871287128712872,
      "loss": 1.4421,
      "step": 65
    },
    {
      "epoch": 0.06993006993006994,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.0001386138613861386,
      "loss": 1.4223,
      "step": 70
    },
    {
      "epoch": 0.07492507492507493,
      "grad_norm": 0.091796875,
      "learning_rate": 0.0001485148514851485,
      "loss": 1.4025,
      "step": 75
    },
    {
      "epoch": 0.07992007992007992,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.00015841584158415842,
      "loss": 1.4102,
      "step": 80
    },
    {
      "epoch": 0.08491508491508491,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.00016831683168316833,
      "loss": 1.3908,
      "step": 85
    },
    {
      "epoch": 0.0899100899100899,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00017821782178217824,
      "loss": 1.4007,
      "step": 90
    },
    {
      "epoch": 0.09490509490509491,
      "grad_norm": 0.0859375,
      "learning_rate": 0.00018811881188118812,
      "loss": 1.3824,
      "step": 95
    },
    {
      "epoch": 0.0999000999000999,
      "grad_norm": 0.099609375,
      "learning_rate": 0.00019801980198019803,
      "loss": 1.3897,
      "step": 100
    },
    {
      "epoch": 0.1048951048951049,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.00019999025240093044,
      "loss": 1.3614,
      "step": 105
    },
    {
      "epoch": 0.10989010989010989,
      "grad_norm": 0.087890625,
      "learning_rate": 0.00019995065603657316,
      "loss": 1.3792,
      "step": 110
    },
    {
      "epoch": 0.11488511488511488,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.0001998806137341434,
      "loss": 1.37,
      "step": 115
    },
    {
      "epoch": 0.11988011988011989,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.000199780146829205,
      "loss": 1.3638,
      "step": 120
    },
    {
      "epoch": 0.12487512487512488,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00019964928592495045,
      "loss": 1.3911,
      "step": 125
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00019948807088287883,
      "loss": 1.3534,
      "step": 130
    },
    {
      "epoch": 0.13486513486513488,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001992965508106537,
      "loss": 1.3619,
      "step": 135
    },
    {
      "epoch": 0.13986013986013987,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.00019907478404714436,
      "loss": 1.3612,
      "step": 140
    },
    {
      "epoch": 0.14485514485514486,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001988228381446553,
      "loss": 1.3531,
      "step": 145
    },
    {
      "epoch": 0.14985014985014986,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.00019854078984834903,
      "loss": 1.3589,
      "step": 150
    },
    {
      "epoch": 0.15484515484515485,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.0001982287250728689,
      "loss": 1.3393,
      "step": 155
    },
    {
      "epoch": 0.15984015984015984,
      "grad_norm": 0.09765625,
      "learning_rate": 0.0001978867388761685,
      "loss": 1.3378,
      "step": 160
    },
    {
      "epoch": 0.16483516483516483,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00019751493543055632,
      "loss": 1.3435,
      "step": 165
    },
    {
      "epoch": 0.16983016983016982,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.00019711342799096361,
      "loss": 1.3241,
      "step": 170
    },
    {
      "epoch": 0.17482517482517482,
      "grad_norm": 0.11376953125,
      "learning_rate": 0.00019668233886044597,
      "loss": 1.3394,
      "step": 175
    },
    {
      "epoch": 0.1798201798201798,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00019622179935292855,
      "loss": 1.3357,
      "step": 180
    },
    {
      "epoch": 0.1848151848151848,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00019573194975320673,
      "loss": 1.3377,
      "step": 185
    },
    {
      "epoch": 0.18981018981018982,
      "grad_norm": 0.103515625,
      "learning_rate": 0.00019521293927421388,
      "loss": 1.3461,
      "step": 190
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00019466492601156966,
      "loss": 1.3366,
      "step": 195
    },
    {
      "epoch": 0.1998001998001998,
      "grad_norm": 0.11083984375,
      "learning_rate": 0.00019408807689542257,
      "loss": 1.3307,
      "step": 200
    },
    {
      "epoch": 0.2047952047952048,
      "grad_norm": 0.1005859375,
      "learning_rate": 0.00019348256763960145,
      "loss": 1.336,
      "step": 205
    },
    {
      "epoch": 0.2097902097902098,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.00019284858268809137,
      "loss": 1.3395,
      "step": 210
    },
    {
      "epoch": 0.21478521478521478,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00019218631515885006,
      "loss": 1.3457,
      "step": 215
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.0001914959667849825,
      "loss": 1.3472,
      "step": 220
    },
    {
      "epoch": 0.22477522477522477,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.00019077774785329087,
      "loss": 1.3418,
      "step": 225
    },
    {
      "epoch": 0.22977022977022976,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.00019003187714021938,
      "loss": 1.3217,
      "step": 230
    },
    {
      "epoch": 0.23476523476523475,
      "grad_norm": 0.09716796875,
      "learning_rate": 0.00018925858184521256,
      "loss": 1.3256,
      "step": 235
    },
    {
      "epoch": 0.23976023976023977,
      "grad_norm": 0.099609375,
      "learning_rate": 0.0001884580975215084,
      "loss": 1.3417,
      "step": 240
    },
    {
      "epoch": 0.24475524475524477,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00018763066800438636,
      "loss": 1.3261,
      "step": 245
    },
    {
      "epoch": 0.24975024975024976,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00018677654533689287,
      "loss": 1.3252,
      "step": 250
    },
    {
      "epoch": 0.2547452547452547,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.00018589598969306645,
      "loss": 1.3387,
      "step": 255
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.00018498926929868642,
      "loss": 1.3577,
      "step": 260
    },
    {
      "epoch": 0.2647352647352647,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.00018405666034956844,
      "loss": 1.3321,
      "step": 265
    },
    {
      "epoch": 0.26973026973026976,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00018309844692743283,
      "loss": 1.3067,
      "step": 270
    },
    {
      "epoch": 0.27472527472527475,
      "grad_norm": 0.1103515625,
      "learning_rate": 0.00018211492091337042,
      "loss": 1.3489,
      "step": 275
    },
    {
      "epoch": 0.27972027972027974,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00018110638189893267,
      "loss": 1.3276,
      "step": 280
    },
    {
      "epoch": 0.28471528471528473,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00018007313709487334,
      "loss": 1.3258,
      "step": 285
    },
    {
      "epoch": 0.2897102897102897,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00017901550123756906,
      "loss": 1.332,
      "step": 290
    },
    {
      "epoch": 0.2947052947052947,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.00017793379649314744,
      "loss": 1.335,
      "step": 295
    },
    {
      "epoch": 0.2997002997002997,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00017682835235935236,
      "loss": 1.3266,
      "step": 300
    },
    {
      "epoch": 0.3046953046953047,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.00017569950556517566,
      "loss": 1.3174,
      "step": 305
    },
    {
      "epoch": 0.3096903096903097,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.00017454759996828623,
      "loss": 1.3212,
      "step": 310
    },
    {
      "epoch": 0.3146853146853147,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00017337298645028764,
      "loss": 1.326,
      "step": 315
    },
    {
      "epoch": 0.3196803196803197,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00017217602280983623,
      "loss": 1.3364,
      "step": 320
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.0001709570736536521,
      "loss": 1.3168,
      "step": 325
    },
    {
      "epoch": 0.32967032967032966,
      "grad_norm": 0.10498046875,
      "learning_rate": 0.00016971651028545648,
      "loss": 1.3166,
      "step": 330
    },
    {
      "epoch": 0.33466533466533466,
      "grad_norm": 0.1005859375,
      "learning_rate": 0.00016845471059286887,
      "loss": 1.3228,
      "step": 335
    },
    {
      "epoch": 0.33966033966033965,
      "grad_norm": 0.1015625,
      "learning_rate": 0.00016717205893229903,
      "loss": 1.3277,
      "step": 340
    },
    {
      "epoch": 0.34465534465534464,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00016586894601186805,
      "loss": 1.3197,
      "step": 345
    },
    {
      "epoch": 0.34965034965034963,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00016454576877239507,
      "loss": 1.3276,
      "step": 350
    },
    {
      "epoch": 0.3546453546453546,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.0001632029302664851,
      "loss": 1.309,
      "step": 355
    },
    {
      "epoch": 0.3596403596403596,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001618408395357554,
      "loss": 1.334,
      "step": 360
    },
    {
      "epoch": 0.3646353646353646,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.0001604599114862375,
      "loss": 1.3287,
      "step": 365
    },
    {
      "epoch": 0.3696303696303696,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00015906056676199255,
      "loss": 1.3165,
      "step": 370
    },
    {
      "epoch": 0.37462537462537465,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00015764323161697935,
      "loss": 1.3127,
      "step": 375
    },
    {
      "epoch": 0.37962037962037964,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00015620833778521307,
      "loss": 1.3297,
      "step": 380
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00015475632234925504,
      "loss": 1.3184,
      "step": 385
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.000153287627607073,
      "loss": 1.3004,
      "step": 390
    },
    {
      "epoch": 0.3946053946053946,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00015180270093731303,
      "loss": 1.3179,
      "step": 395
    },
    {
      "epoch": 0.3996003996003996,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00015030199466302353,
      "loss": 1.3294,
      "step": 400
    },
    {
      "epoch": 0.4045954045954046,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.0001487859659138733,
      "loss": 1.3469,
      "step": 405
    },
    {
      "epoch": 0.4095904095904096,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.00014725507648690543,
      "loss": 1.3166,
      "step": 410
    },
    {
      "epoch": 0.4145854145854146,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.00014570979270586945,
      "loss": 1.3149,
      "step": 415
    },
    {
      "epoch": 0.4195804195804196,
      "grad_norm": 0.09716796875,
      "learning_rate": 0.00014415058527917452,
      "loss": 1.3365,
      "step": 420
    },
    {
      "epoch": 0.4245754245754246,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.00014257792915650728,
      "loss": 1.3289,
      "step": 425
    },
    {
      "epoch": 0.42957042957042957,
      "grad_norm": 0.103515625,
      "learning_rate": 0.00014099230338415728,
      "loss": 1.2932,
      "step": 430
    },
    {
      "epoch": 0.43456543456543456,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.00013939419095909512,
      "loss": 1.318,
      "step": 435
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00013778407868184672,
      "loss": 1.2892,
      "step": 440
    },
    {
      "epoch": 0.44455544455544455,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00013616245700820922,
      "loss": 1.3114,
      "step": 445
    },
    {
      "epoch": 0.44955044955044954,
      "grad_norm": 0.1015625,
      "learning_rate": 0.00013452981989985348,
      "loss": 1.3143,
      "step": 450
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.10498046875,
      "learning_rate": 0.00013288666467385833,
      "loss": 1.3231,
      "step": 455
    },
    {
      "epoch": 0.4595404595404595,
      "grad_norm": 0.095703125,
      "learning_rate": 0.00013123349185122327,
      "loss": 1.3077,
      "step": 460
    },
    {
      "epoch": 0.4645354645354645,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.00012957080500440468,
      "loss": 1.3247,
      "step": 465
    },
    {
      "epoch": 0.4695304695304695,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00012789911060392294,
      "loss": 1.3097,
      "step": 470
    },
    {
      "epoch": 0.4745254745254745,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00012621891786408648,
      "loss": 1.3099,
      "step": 475
    },
    {
      "epoch": 0.47952047952047955,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00012453073858788026,
      "loss": 1.3267,
      "step": 480
    },
    {
      "epoch": 0.48451548451548454,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00012283508701106557,
      "loss": 1.3114,
      "step": 485
    },
    {
      "epoch": 0.48951048951048953,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00012113247964553888,
      "loss": 1.2904,
      "step": 490
    },
    {
      "epoch": 0.4945054945054945,
      "grad_norm": 0.103515625,
      "learning_rate": 0.0001194234351219972,
      "loss": 1.3151,
      "step": 495
    },
    {
      "epoch": 0.4995004995004995,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.00011770847403195834,
      "loss": 1.3126,
      "step": 500
    },
    {
      "epoch": 0.5044955044955045,
      "grad_norm": 0.099609375,
      "learning_rate": 0.0001159881187691835,
      "loss": 1.3108,
      "step": 505
    },
    {
      "epoch": 0.5094905094905094,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.00011426289337055119,
      "loss": 1.3032,
      "step": 510
    },
    {
      "epoch": 0.5144855144855145,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.00011253332335643043,
      "loss": 1.3043,
      "step": 515
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.099609375,
      "learning_rate": 0.0001107999355706023,
      "loss": 1.3188,
      "step": 520
    },
    {
      "epoch": 0.5244755244755245,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00010906325801977804,
      "loss": 1.3266,
      "step": 525
    },
    {
      "epoch": 0.5294705294705294,
      "grad_norm": 0.10498046875,
      "learning_rate": 0.00010732381971276318,
      "loss": 1.3156,
      "step": 530
    },
    {
      "epoch": 0.5344655344655345,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00010558215049931638,
      "loss": 1.304,
      "step": 535
    },
    {
      "epoch": 0.5394605394605395,
      "grad_norm": 0.095703125,
      "learning_rate": 0.00010383878090875201,
      "loss": 1.3089,
      "step": 540
    },
    {
      "epoch": 0.5444555444555444,
      "grad_norm": 0.09765625,
      "learning_rate": 0.0001020942419883357,
      "loss": 1.3298,
      "step": 545
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00010034906514152238,
      "loss": 1.3127,
      "step": 550
    },
    {
      "epoch": 0.5544455544455544,
      "grad_norm": 0.10107421875,
      "learning_rate": 9.860378196608549e-05,
      "loss": 1.3248,
      "step": 555
    },
    {
      "epoch": 0.5594405594405595,
      "grad_norm": 0.10107421875,
      "learning_rate": 9.685892409218717e-05,
      "loss": 1.3116,
      "step": 560
    },
    {
      "epoch": 0.5644355644355644,
      "grad_norm": 0.09619140625,
      "learning_rate": 9.511502302043868e-05,
      "loss": 1.3196,
      "step": 565
    },
    {
      "epoch": 0.5694305694305695,
      "grad_norm": 0.1005859375,
      "learning_rate": 9.337260996000002e-05,
      "loss": 1.3183,
      "step": 570
    },
    {
      "epoch": 0.5744255744255744,
      "grad_norm": 0.10107421875,
      "learning_rate": 9.163221566676847e-05,
      "loss": 1.3165,
      "step": 575
    },
    {
      "epoch": 0.5794205794205795,
      "grad_norm": 0.09716796875,
      "learning_rate": 8.989437028170537e-05,
      "loss": 1.3125,
      "step": 580
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 0.0986328125,
      "learning_rate": 8.81596031693499e-05,
      "loss": 1.325,
      "step": 585
    },
    {
      "epoch": 0.5894105894105894,
      "grad_norm": 0.09716796875,
      "learning_rate": 8.642844275656957e-05,
      "loss": 1.3308,
      "step": 590
    },
    {
      "epoch": 0.5944055944055944,
      "grad_norm": 0.10107421875,
      "learning_rate": 8.47014163715962e-05,
      "loss": 1.3099,
      "step": 595
    },
    {
      "epoch": 0.5994005994005994,
      "grad_norm": 0.10009765625,
      "learning_rate": 8.297905008339677e-05,
      "loss": 1.2791,
      "step": 600
    },
    {
      "epoch": 0.6043956043956044,
      "grad_norm": 0.09619140625,
      "learning_rate": 8.126186854142752e-05,
      "loss": 1.3162,
      "step": 605
    },
    {
      "epoch": 0.6093906093906094,
      "grad_norm": 0.095703125,
      "learning_rate": 7.955039481582097e-05,
      "loss": 1.3093,
      "step": 610
    },
    {
      "epoch": 0.6143856143856143,
      "grad_norm": 0.09716796875,
      "learning_rate": 7.784515023805328e-05,
      "loss": 1.309,
      "step": 615
    },
    {
      "epoch": 0.6193806193806194,
      "grad_norm": 0.095703125,
      "learning_rate": 7.614665424214193e-05,
      "loss": 1.285,
      "step": 620
    },
    {
      "epoch": 0.6243756243756243,
      "grad_norm": 0.095703125,
      "learning_rate": 7.445542420642097e-05,
      "loss": 1.3155,
      "step": 625
    },
    {
      "epoch": 0.6293706293706294,
      "grad_norm": 0.1005859375,
      "learning_rate": 7.277197529594257e-05,
      "loss": 1.301,
      "step": 630
    },
    {
      "epoch": 0.6343656343656343,
      "grad_norm": 0.09912109375,
      "learning_rate": 7.109682030555283e-05,
      "loss": 1.295,
      "step": 635
    },
    {
      "epoch": 0.6393606393606394,
      "grad_norm": 0.0966796875,
      "learning_rate": 6.943046950368944e-05,
      "loss": 1.315,
      "step": 640
    },
    {
      "epoch": 0.6443556443556444,
      "grad_norm": 0.09765625,
      "learning_rate": 6.77734304769489e-05,
      "loss": 1.2878,
      "step": 645
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 0.1005859375,
      "learning_rate": 6.612620797547087e-05,
      "loss": 1.3135,
      "step": 650
    },
    {
      "epoch": 0.6543456543456544,
      "grad_norm": 0.09912109375,
      "learning_rate": 6.448930375918631e-05,
      "loss": 1.2958,
      "step": 655
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 0.10107421875,
      "learning_rate": 6.286321644497655e-05,
      "loss": 1.3286,
      "step": 660
    },
    {
      "epoch": 0.6643356643356644,
      "grad_norm": 0.09765625,
      "learning_rate": 6.12484413547897e-05,
      "loss": 1.2988,
      "step": 665
    },
    {
      "epoch": 0.6693306693306693,
      "grad_norm": 0.0966796875,
      "learning_rate": 5.964547036476099e-05,
      "loss": 1.3086,
      "step": 670
    },
    {
      "epoch": 0.6743256743256744,
      "grad_norm": 0.09765625,
      "learning_rate": 5.805479175538229e-05,
      "loss": 1.3056,
      "step": 675
    },
    {
      "epoch": 0.6793206793206793,
      "grad_norm": 0.0966796875,
      "learning_rate": 5.647689006276726e-05,
      "loss": 1.3259,
      "step": 680
    },
    {
      "epoch": 0.6843156843156843,
      "grad_norm": 0.09619140625,
      "learning_rate": 5.491224593105695e-05,
      "loss": 1.3105,
      "step": 685
    },
    {
      "epoch": 0.6893106893106893,
      "grad_norm": 0.095703125,
      "learning_rate": 5.33613359660109e-05,
      "loss": 1.3113,
      "step": 690
    },
    {
      "epoch": 0.6943056943056943,
      "grad_norm": 0.095703125,
      "learning_rate": 5.182463258982846e-05,
      "loss": 1.3144,
      "step": 695
    },
    {
      "epoch": 0.6993006993006993,
      "grad_norm": 0.09912109375,
      "learning_rate": 5.0302603897244474e-05,
      "loss": 1.3059,
      "step": 700
    },
    {
      "epoch": 0.7042957042957043,
      "grad_norm": 0.0966796875,
      "learning_rate": 4.8795713512942865e-05,
      "loss": 1.2977,
      "step": 705
    },
    {
      "epoch": 0.7092907092907093,
      "grad_norm": 0.10009765625,
      "learning_rate": 4.7304420450332244e-05,
      "loss": 1.3051,
      "step": 710
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.09912109375,
      "learning_rate": 4.582917897172603e-05,
      "loss": 1.3229,
      "step": 715
    },
    {
      "epoch": 0.7192807192807192,
      "grad_norm": 0.09423828125,
      "learning_rate": 4.437043844996952e-05,
      "loss": 1.3063,
      "step": 720
    },
    {
      "epoch": 0.7242757242757243,
      "grad_norm": 0.099609375,
      "learning_rate": 4.2928643231556844e-05,
      "loss": 1.2956,
      "step": 725
    },
    {
      "epoch": 0.7292707292707292,
      "grad_norm": 0.0947265625,
      "learning_rate": 4.150423250127845e-05,
      "loss": 1.3036,
      "step": 730
    },
    {
      "epoch": 0.7342657342657343,
      "grad_norm": 0.0947265625,
      "learning_rate": 4.009764014844143e-05,
      "loss": 1.3252,
      "step": 735
    },
    {
      "epoch": 0.7392607392607392,
      "grad_norm": 0.09375,
      "learning_rate": 3.8709294634702376e-05,
      "loss": 1.3207,
      "step": 740
    },
    {
      "epoch": 0.7442557442557443,
      "grad_norm": 0.09423828125,
      "learning_rate": 3.733961886355398e-05,
      "loss": 1.2915,
      "step": 745
    },
    {
      "epoch": 0.7492507492507493,
      "grad_norm": 0.0986328125,
      "learning_rate": 3.5989030051504434e-05,
      "loss": 1.3117,
      "step": 750
    },
    {
      "epoch": 0.7542457542457542,
      "grad_norm": 0.09814453125,
      "learning_rate": 3.465793960098945e-05,
      "loss": 1.3022,
      "step": 755
    },
    {
      "epoch": 0.7592407592407593,
      "grad_norm": 0.09716796875,
      "learning_rate": 3.334675297505476e-05,
      "loss": 1.3298,
      "step": 760
    },
    {
      "epoch": 0.7642357642357642,
      "grad_norm": 0.09521484375,
      "learning_rate": 3.205586957384838e-05,
      "loss": 1.3085,
      "step": 765
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.09326171875,
      "learning_rate": 3.078568261295933e-05,
      "loss": 1.2917,
      "step": 770
    },
    {
      "epoch": 0.7742257742257742,
      "grad_norm": 0.09814453125,
      "learning_rate": 2.953657900364053e-05,
      "loss": 1.3111,
      "step": 775
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.09326171875,
      "learning_rate": 2.8308939234951726e-05,
      "loss": 1.3108,
      "step": 780
    },
    {
      "epoch": 0.7842157842157842,
      "grad_norm": 0.09326171875,
      "learning_rate": 2.7103137257858868e-05,
      "loss": 1.3106,
      "step": 785
    },
    {
      "epoch": 0.7892107892107892,
      "grad_norm": 0.0947265625,
      "learning_rate": 2.5919540371325e-05,
      "loss": 1.292,
      "step": 790
    },
    {
      "epoch": 0.7942057942057942,
      "grad_norm": 0.0947265625,
      "learning_rate": 2.4758509110427575e-05,
      "loss": 1.3089,
      "step": 795
    },
    {
      "epoch": 0.7992007992007992,
      "grad_norm": 0.09375,
      "learning_rate": 2.362039713653581e-05,
      "loss": 1.3201,
      "step": 800
    },
    {
      "epoch": 0.8041958041958042,
      "grad_norm": 0.09326171875,
      "learning_rate": 2.2505551129582047e-05,
      "loss": 1.2985,
      "step": 805
    },
    {
      "epoch": 0.8091908091908092,
      "grad_norm": 0.095703125,
      "learning_rate": 2.1414310682459802e-05,
      "loss": 1.309,
      "step": 810
    },
    {
      "epoch": 0.8141858141858141,
      "grad_norm": 0.0927734375,
      "learning_rate": 2.0347008197580374e-05,
      "loss": 1.3097,
      "step": 815
    },
    {
      "epoch": 0.8191808191808192,
      "grad_norm": 0.0927734375,
      "learning_rate": 1.930396878561983e-05,
      "loss": 1.3215,
      "step": 820
    },
    {
      "epoch": 0.8241758241758241,
      "grad_norm": 0.09326171875,
      "learning_rate": 1.8285510166487152e-05,
      "loss": 1.3002,
      "step": 825
    },
    {
      "epoch": 0.8291708291708292,
      "grad_norm": 0.0927734375,
      "learning_rate": 1.7291942572543807e-05,
      "loss": 1.3233,
      "step": 830
    },
    {
      "epoch": 0.8341658341658341,
      "grad_norm": 0.09521484375,
      "learning_rate": 1.632356865410384e-05,
      "loss": 1.3163,
      "step": 835
    },
    {
      "epoch": 0.8391608391608392,
      "grad_norm": 0.0947265625,
      "learning_rate": 1.538068338724361e-05,
      "loss": 1.2986,
      "step": 840
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 0.09375,
      "learning_rate": 1.4463573983949341e-05,
      "loss": 1.319,
      "step": 845
    },
    {
      "epoch": 0.8491508491508492,
      "grad_norm": 0.09326171875,
      "learning_rate": 1.3572519804629536e-05,
      "loss": 1.3119,
      "step": 850
    },
    {
      "epoch": 0.8541458541458542,
      "grad_norm": 0.09326171875,
      "learning_rate": 1.2707792273019048e-05,
      "loss": 1.3047,
      "step": 855
    },
    {
      "epoch": 0.8591408591408591,
      "grad_norm": 0.09375,
      "learning_rate": 1.1869654793500784e-05,
      "loss": 1.3001,
      "step": 860
    },
    {
      "epoch": 0.8641358641358642,
      "grad_norm": 0.09228515625,
      "learning_rate": 1.1058362670870249e-05,
      "loss": 1.3192,
      "step": 865
    },
    {
      "epoch": 0.8691308691308691,
      "grad_norm": 0.0947265625,
      "learning_rate": 1.0274163032567163e-05,
      "loss": 1.3049,
      "step": 870
    },
    {
      "epoch": 0.8741258741258742,
      "grad_norm": 0.09130859375,
      "learning_rate": 9.517294753398064e-06,
      "loss": 1.3049,
      "step": 875
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.091796875,
      "learning_rate": 8.787988382772705e-06,
      "loss": 1.3111,
      "step": 880
    },
    {
      "epoch": 0.8841158841158842,
      "grad_norm": 0.09375,
      "learning_rate": 8.086466074476563e-06,
      "loss": 1.2984,
      "step": 885
    },
    {
      "epoch": 0.8891108891108891,
      "grad_norm": 0.09228515625,
      "learning_rate": 7.412941519000527e-06,
      "loss": 1.3023,
      "step": 890
    },
    {
      "epoch": 0.8941058941058941,
      "grad_norm": 0.09326171875,
      "learning_rate": 6.767619878448783e-06,
      "loss": 1.3173,
      "step": 895
    },
    {
      "epoch": 0.8991008991008991,
      "grad_norm": 0.0927734375,
      "learning_rate": 6.1506977240444074e-06,
      "loss": 1.3053,
      "step": 900
    },
    {
      "epoch": 0.9040959040959041,
      "grad_norm": 0.09375,
      "learning_rate": 5.562362976251901e-06,
      "loss": 1.312,
      "step": 905
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.09326171875,
      "learning_rate": 5.002794847534764e-06,
      "loss": 1.2995,
      "step": 910
    },
    {
      "epoch": 0.9140859140859141,
      "grad_norm": 0.0947265625,
      "learning_rate": 4.4721637877656375e-06,
      "loss": 1.332,
      "step": 915
    },
    {
      "epoch": 0.919080919080919,
      "grad_norm": 0.091796875,
      "learning_rate": 3.970631432305694e-06,
      "loss": 1.3225,
      "step": 920
    },
    {
      "epoch": 0.9240759240759241,
      "grad_norm": 0.0927734375,
      "learning_rate": 3.4983505527688586e-06,
      "loss": 1.2955,
      "step": 925
    },
    {
      "epoch": 0.929070929070929,
      "grad_norm": 0.095703125,
      "learning_rate": 3.0554650104861136e-06,
      "loss": 1.3007,
      "step": 930
    },
    {
      "epoch": 0.9340659340659341,
      "grad_norm": 0.09326171875,
      "learning_rate": 2.6421097126839712e-06,
      "loss": 1.3259,
      "step": 935
    },
    {
      "epoch": 0.939060939060939,
      "grad_norm": 0.095703125,
      "learning_rate": 2.2584105713904125e-06,
      "loss": 1.3091,
      "step": 940
    },
    {
      "epoch": 0.9440559440559441,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.904484465080847e-06,
      "loss": 1.3195,
      "step": 945
    },
    {
      "epoch": 0.949050949050949,
      "grad_norm": 0.09423828125,
      "learning_rate": 1.580439203075812e-06,
      "loss": 1.3043,
      "step": 950
    },
    {
      "epoch": 0.954045954045954,
      "grad_norm": 0.09375,
      "learning_rate": 1.2863734927012095e-06,
      "loss": 1.3167,
      "step": 955
    },
    {
      "epoch": 0.9590409590409591,
      "grad_norm": 0.09326171875,
      "learning_rate": 1.0223769092211012e-06,
      "loss": 1.3064,
      "step": 960
    },
    {
      "epoch": 0.964035964035964,
      "grad_norm": 0.091796875,
      "learning_rate": 7.885298685522235e-07,
      "loss": 1.3197,
      "step": 965
    },
    {
      "epoch": 0.9690309690309691,
      "grad_norm": 0.09228515625,
      "learning_rate": 5.849036027684606e-07,
      "loss": 1.3074,
      "step": 970
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 0.09912109375,
      "learning_rate": 4.115601384029666e-07,
      "loss": 1.3183,
      "step": 975
    },
    {
      "epoch": 0.9790209790209791,
      "grad_norm": 0.09326171875,
      "learning_rate": 2.685522775541904e-07,
      "loss": 1.3165,
      "step": 980
    },
    {
      "epoch": 0.984015984015984,
      "grad_norm": 0.09130859375,
      "learning_rate": 1.5592358180189782e-07,
      "loss": 1.3114,
      "step": 985
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 0.09326171875,
      "learning_rate": 7.370835893788508e-08,
      "loss": 1.2917,
      "step": 990
    },
    {
      "epoch": 0.994005994005994,
      "grad_norm": 0.095703125,
      "learning_rate": 2.193165251545004e-08,
      "loss": 1.3122,
      "step": 995
    },
    {
      "epoch": 0.999000999000999,
      "grad_norm": 0.09326171875,
      "learning_rate": 6.092342209607083e-10,
      "loss": 1.3184,
      "step": 1000
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.3126407861709595,
      "eval_runtime": 1177.4856,
      "eval_samples_per_second": 12.041,
      "eval_steps_per_second": 12.041,
      "step": 1001
    },
    {
      "epoch": 1.0,
      "step": 1001,
      "total_flos": 1.709524252278915e+18,
      "train_loss": 0.07992646839473393,
      "train_runtime": 3675.5245,
      "train_samples_per_second": 34.86,
      "train_steps_per_second": 0.272
    }
  ],
  "logging_steps": 5,
  "max_steps": 1001,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 1.709524252278915e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}