|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.352, |
|
"eval_steps": 500, |
|
"global_step": 4400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004, |
|
"grad_norm": 4.773154258728027, |
|
"learning_rate": 4.9999997806754614e-05, |
|
"loss": 2.4535, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0008, |
|
"grad_norm": 5.585668563842773, |
|
"learning_rate": 4.999999122701883e-05, |
|
"loss": 1.9042, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0012, |
|
"grad_norm": 2.7855770587921143, |
|
"learning_rate": 4.99999802607938e-05, |
|
"loss": 1.6398, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0016, |
|
"grad_norm": 2.4066975116729736, |
|
"learning_rate": 4.9999964908081455e-05, |
|
"loss": 1.3978, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002, |
|
"grad_norm": 1.6574153900146484, |
|
"learning_rate": 4.999994516888449e-05, |
|
"loss": 1.3416, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0024, |
|
"grad_norm": 1.886969804763794, |
|
"learning_rate": 4.999992104320636e-05, |
|
"loss": 1.4384, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0028, |
|
"grad_norm": 1.656772494316101, |
|
"learning_rate": 4.99998925310513e-05, |
|
"loss": 1.3529, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 1.6747465133666992, |
|
"learning_rate": 4.999985963242432e-05, |
|
"loss": 1.3142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0036, |
|
"grad_norm": 1.633796215057373, |
|
"learning_rate": 4.999982234733118e-05, |
|
"loss": 1.3162, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.004, |
|
"grad_norm": 1.5636812448501587, |
|
"learning_rate": 4.999978067577844e-05, |
|
"loss": 1.3443, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0044, |
|
"grad_norm": 1.6511592864990234, |
|
"learning_rate": 4.9999734617773405e-05, |
|
"loss": 1.2582, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0048, |
|
"grad_norm": 1.6226695775985718, |
|
"learning_rate": 4.999968417332415e-05, |
|
"loss": 1.3449, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0052, |
|
"grad_norm": 1.502573847770691, |
|
"learning_rate": 4.9999629342439524e-05, |
|
"loss": 1.2836, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0056, |
|
"grad_norm": 1.5057326555252075, |
|
"learning_rate": 4.999957012512916e-05, |
|
"loss": 1.2437, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.006, |
|
"grad_norm": 1.627768874168396, |
|
"learning_rate": 4.999950652140343e-05, |
|
"loss": 1.2318, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 1.7765494585037231, |
|
"learning_rate": 4.999943853127351e-05, |
|
"loss": 1.3253, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0068, |
|
"grad_norm": 1.7718794345855713, |
|
"learning_rate": 4.999936615475133e-05, |
|
"loss": 1.3195, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0072, |
|
"grad_norm": 1.2835720777511597, |
|
"learning_rate": 4.999928939184958e-05, |
|
"loss": 1.2973, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0076, |
|
"grad_norm": 1.77462637424469, |
|
"learning_rate": 4.999920824258173e-05, |
|
"loss": 1.2883, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 1.3049944639205933, |
|
"learning_rate": 4.999912270696202e-05, |
|
"loss": 1.346, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0084, |
|
"grad_norm": 1.7110176086425781, |
|
"learning_rate": 4.9999032785005464e-05, |
|
"loss": 1.3119, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0088, |
|
"grad_norm": 1.6607234477996826, |
|
"learning_rate": 4.9998938476727826e-05, |
|
"loss": 1.2937, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0092, |
|
"grad_norm": 1.6394551992416382, |
|
"learning_rate": 4.999883978214567e-05, |
|
"loss": 1.2376, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 1.8243571519851685, |
|
"learning_rate": 4.9998736701276295e-05, |
|
"loss": 1.3009, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9557856321334839, |
|
"learning_rate": 4.999862923413781e-05, |
|
"loss": 1.2827, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0104, |
|
"grad_norm": 1.4470913410186768, |
|
"learning_rate": 4.999851738074904e-05, |
|
"loss": 1.3158, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0108, |
|
"grad_norm": 1.4409664869308472, |
|
"learning_rate": 4.999840114112965e-05, |
|
"loss": 1.2554, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0112, |
|
"grad_norm": 1.5476871728897095, |
|
"learning_rate": 4.99982805153e-05, |
|
"loss": 1.2788, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0116, |
|
"grad_norm": 1.5238358974456787, |
|
"learning_rate": 4.999815550328128e-05, |
|
"loss": 1.22, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.012, |
|
"grad_norm": 1.388649344444275, |
|
"learning_rate": 4.9998026105095405e-05, |
|
"loss": 1.2836, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0124, |
|
"grad_norm": 1.6821857690811157, |
|
"learning_rate": 4.999789232076509e-05, |
|
"loss": 1.2498, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.7721805572509766, |
|
"learning_rate": 4.9997754150313815e-05, |
|
"loss": 1.2757, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0132, |
|
"grad_norm": 1.5760489702224731, |
|
"learning_rate": 4.99976115937658e-05, |
|
"loss": 1.2378, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0136, |
|
"grad_norm": 1.7729449272155762, |
|
"learning_rate": 4.999746465114609e-05, |
|
"loss": 1.2126, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.014, |
|
"grad_norm": 1.813071370124817, |
|
"learning_rate": 4.999731332248044e-05, |
|
"loss": 1.2379, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0144, |
|
"grad_norm": 1.7245049476623535, |
|
"learning_rate": 4.999715760779541e-05, |
|
"loss": 1.2449, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0148, |
|
"grad_norm": 1.8477286100387573, |
|
"learning_rate": 4.999699750711833e-05, |
|
"loss": 1.2934, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0152, |
|
"grad_norm": 1.4089431762695312, |
|
"learning_rate": 4.9996833020477285e-05, |
|
"loss": 1.2313, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0156, |
|
"grad_norm": 2.9127607345581055, |
|
"learning_rate": 4.999666414790113e-05, |
|
"loss": 1.2328, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.9049687385559082, |
|
"learning_rate": 4.9996490889419514e-05, |
|
"loss": 1.2497, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0164, |
|
"grad_norm": 2.1356122493743896, |
|
"learning_rate": 4.9996313245062823e-05, |
|
"loss": 1.2877, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.0168, |
|
"grad_norm": 1.6691316366195679, |
|
"learning_rate": 4.999613121486222e-05, |
|
"loss": 1.2884, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0172, |
|
"grad_norm": 1.8735146522521973, |
|
"learning_rate": 4.999594479884965e-05, |
|
"loss": 1.2742, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.0176, |
|
"grad_norm": 1.6481894254684448, |
|
"learning_rate": 4.999575399705783e-05, |
|
"loss": 1.2408, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.018, |
|
"grad_norm": 1.7049623727798462, |
|
"learning_rate": 4.999555880952023e-05, |
|
"loss": 1.1997, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.0184, |
|
"grad_norm": 1.9054642915725708, |
|
"learning_rate": 4.999535923627109e-05, |
|
"loss": 1.2414, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0188, |
|
"grad_norm": 1.798621416091919, |
|
"learning_rate": 4.999515527734545e-05, |
|
"loss": 1.2848, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.767776608467102, |
|
"learning_rate": 4.999494693277907e-05, |
|
"loss": 1.2626, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0196, |
|
"grad_norm": 2.2002179622650146, |
|
"learning_rate": 4.999473420260853e-05, |
|
"loss": 1.2582, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9857462644577026, |
|
"learning_rate": 4.999451708687114e-05, |
|
"loss": 1.2702, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0204, |
|
"grad_norm": 1.5707662105560303, |
|
"learning_rate": 4.9994295585605e-05, |
|
"loss": 1.1929, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.0208, |
|
"grad_norm": 1.5665488243103027, |
|
"learning_rate": 4.999406969884897e-05, |
|
"loss": 1.2231, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0212, |
|
"grad_norm": 1.508448600769043, |
|
"learning_rate": 4.9993839426642685e-05, |
|
"loss": 1.2621, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.0216, |
|
"grad_norm": 1.6147902011871338, |
|
"learning_rate": 4.999360476902656e-05, |
|
"loss": 1.2124, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.022, |
|
"grad_norm": 1.9145556688308716, |
|
"learning_rate": 4.999336572604175e-05, |
|
"loss": 1.278, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 1.7562631368637085, |
|
"learning_rate": 4.999312229773022e-05, |
|
"loss": 1.2608, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0228, |
|
"grad_norm": 1.7679455280303955, |
|
"learning_rate": 4.9992874484134653e-05, |
|
"loss": 1.2849, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.0232, |
|
"grad_norm": 1.5522981882095337, |
|
"learning_rate": 4.999262228529855e-05, |
|
"loss": 1.2251, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0236, |
|
"grad_norm": 1.8789103031158447, |
|
"learning_rate": 4.999236570126616e-05, |
|
"loss": 1.2721, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 1.6382054090499878, |
|
"learning_rate": 4.99921047320825e-05, |
|
"loss": 1.2494, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0244, |
|
"grad_norm": 1.7209609746932983, |
|
"learning_rate": 4.999183937779336e-05, |
|
"loss": 1.214, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.0248, |
|
"grad_norm": 2.084233283996582, |
|
"learning_rate": 4.99915696384453e-05, |
|
"loss": 1.2649, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0252, |
|
"grad_norm": 1.9279961585998535, |
|
"learning_rate": 4.9991295514085644e-05, |
|
"loss": 1.1784, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 2.1031174659729004, |
|
"learning_rate": 4.9991017004762496e-05, |
|
"loss": 1.2838, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.026, |
|
"grad_norm": 1.5954639911651611, |
|
"learning_rate": 4.999073411052472e-05, |
|
"loss": 1.3073, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.0264, |
|
"grad_norm": 1.612317681312561, |
|
"learning_rate": 4.9990446831421955e-05, |
|
"loss": 1.2914, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0268, |
|
"grad_norm": 1.5913759469985962, |
|
"learning_rate": 4.99901551675046e-05, |
|
"loss": 1.3132, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.0272, |
|
"grad_norm": 1.8079237937927246, |
|
"learning_rate": 4.998985911882384e-05, |
|
"loss": 1.1934, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0276, |
|
"grad_norm": 1.9866459369659424, |
|
"learning_rate": 4.998955868543161e-05, |
|
"loss": 1.2486, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.028, |
|
"grad_norm": 1.791840672492981, |
|
"learning_rate": 4.998925386738063e-05, |
|
"loss": 1.2049, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0284, |
|
"grad_norm": 1.7033882141113281, |
|
"learning_rate": 4.998894466472438e-05, |
|
"loss": 1.2323, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 1.525989294052124, |
|
"learning_rate": 4.998863107751711e-05, |
|
"loss": 1.2439, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0292, |
|
"grad_norm": 1.5799592733383179, |
|
"learning_rate": 4.9988313105813856e-05, |
|
"loss": 1.2807, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.0296, |
|
"grad_norm": 1.7438101768493652, |
|
"learning_rate": 4.9987990749670395e-05, |
|
"loss": 1.2605, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.538445234298706, |
|
"learning_rate": 4.998766400914329e-05, |
|
"loss": 1.2185, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.0304, |
|
"grad_norm": 1.7448121309280396, |
|
"learning_rate": 4.998733288428987e-05, |
|
"loss": 1.2511, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0308, |
|
"grad_norm": 1.7485535144805908, |
|
"learning_rate": 4.9986997375168246e-05, |
|
"loss": 1.2266, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.0312, |
|
"grad_norm": 1.970339059829712, |
|
"learning_rate": 4.9986657481837277e-05, |
|
"loss": 1.2363, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.0316, |
|
"grad_norm": 1.883518934249878, |
|
"learning_rate": 4.9986313204356594e-05, |
|
"loss": 1.2034, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.6562483310699463, |
|
"learning_rate": 4.9985964542786614e-05, |
|
"loss": 1.2578, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0324, |
|
"grad_norm": 2.5307445526123047, |
|
"learning_rate": 4.99856114971885e-05, |
|
"loss": 1.2506, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.0328, |
|
"grad_norm": 1.5744280815124512, |
|
"learning_rate": 4.9985254067624215e-05, |
|
"loss": 1.3237, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.0332, |
|
"grad_norm": 1.7964937686920166, |
|
"learning_rate": 4.9984892254156465e-05, |
|
"loss": 1.2538, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.0336, |
|
"grad_norm": 1.790570855140686, |
|
"learning_rate": 4.998452605684874e-05, |
|
"loss": 1.3154, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.034, |
|
"grad_norm": 1.8018208742141724, |
|
"learning_rate": 4.998415547576527e-05, |
|
"loss": 1.2145, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.0344, |
|
"grad_norm": 1.8359650373458862, |
|
"learning_rate": 4.998378051097111e-05, |
|
"loss": 1.2926, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0348, |
|
"grad_norm": 1.9998548030853271, |
|
"learning_rate": 4.9983401162532025e-05, |
|
"loss": 1.2439, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 1.9759939908981323, |
|
"learning_rate": 4.998301743051459e-05, |
|
"loss": 1.2516, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.0356, |
|
"grad_norm": 1.7635678052902222, |
|
"learning_rate": 4.9982629314986126e-05, |
|
"loss": 1.2549, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.036, |
|
"grad_norm": 1.8558741807937622, |
|
"learning_rate": 4.998223681601473e-05, |
|
"loss": 1.278, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0364, |
|
"grad_norm": 1.6500812768936157, |
|
"learning_rate": 4.998183993366928e-05, |
|
"loss": 1.2406, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.0368, |
|
"grad_norm": 2.186829090118408, |
|
"learning_rate": 4.998143866801942e-05, |
|
"loss": 1.2514, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0372, |
|
"grad_norm": 1.806112289428711, |
|
"learning_rate": 4.998103301913552e-05, |
|
"loss": 1.2641, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.0376, |
|
"grad_norm": 1.7315857410430908, |
|
"learning_rate": 4.9980622987088795e-05, |
|
"loss": 1.2743, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.038, |
|
"grad_norm": 1.6048853397369385, |
|
"learning_rate": 4.998020857195117e-05, |
|
"loss": 1.2765, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 1.7586766481399536, |
|
"learning_rate": 4.997978977379536e-05, |
|
"loss": 1.2237, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0388, |
|
"grad_norm": 1.8284968137741089, |
|
"learning_rate": 4.997936659269486e-05, |
|
"loss": 1.2165, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.0392, |
|
"grad_norm": 1.761220097541809, |
|
"learning_rate": 4.9978939028723894e-05, |
|
"loss": 1.2629, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0396, |
|
"grad_norm": 1.9978684186935425, |
|
"learning_rate": 4.9978507081957494e-05, |
|
"loss": 1.2974, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.830068588256836, |
|
"learning_rate": 4.997807075247146e-05, |
|
"loss": 1.2658, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0404, |
|
"grad_norm": 1.7393221855163574, |
|
"learning_rate": 4.9977630040342346e-05, |
|
"loss": 1.23, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.0408, |
|
"grad_norm": 1.7979861497879028, |
|
"learning_rate": 4.9977184945647473e-05, |
|
"loss": 1.266, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0412, |
|
"grad_norm": 1.6707987785339355, |
|
"learning_rate": 4.9976735468464935e-05, |
|
"loss": 1.2253, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 1.7217047214508057, |
|
"learning_rate": 4.997628160887361e-05, |
|
"loss": 1.2137, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.042, |
|
"grad_norm": 1.889464259147644, |
|
"learning_rate": 4.9975823366953124e-05, |
|
"loss": 1.2829, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.0424, |
|
"grad_norm": 1.7830469608306885, |
|
"learning_rate": 4.997536074278387e-05, |
|
"loss": 1.2531, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.0428, |
|
"grad_norm": 1.875157356262207, |
|
"learning_rate": 4.9974893736447045e-05, |
|
"loss": 1.2991, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.0432, |
|
"grad_norm": 1.7998117208480835, |
|
"learning_rate": 4.997442234802456e-05, |
|
"loss": 1.3188, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.0436, |
|
"grad_norm": 1.8334897756576538, |
|
"learning_rate": 4.997394657759915e-05, |
|
"loss": 1.2815, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.044, |
|
"grad_norm": 1.8215042352676392, |
|
"learning_rate": 4.9973466425254286e-05, |
|
"loss": 1.2606, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.0444, |
|
"grad_norm": 1.7977797985076904, |
|
"learning_rate": 4.997298189107421e-05, |
|
"loss": 1.2369, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 1.560219407081604, |
|
"learning_rate": 4.997249297514394e-05, |
|
"loss": 1.2351, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.0452, |
|
"grad_norm": 1.5830539464950562, |
|
"learning_rate": 4.997199967754925e-05, |
|
"loss": 1.221, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.0456, |
|
"grad_norm": 1.8532063961029053, |
|
"learning_rate": 4.997150199837671e-05, |
|
"loss": 1.1822, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.046, |
|
"grad_norm": 2.081393003463745, |
|
"learning_rate": 4.997099993771365e-05, |
|
"loss": 1.2456, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.0464, |
|
"grad_norm": 1.8640109300613403, |
|
"learning_rate": 4.997049349564814e-05, |
|
"loss": 1.2186, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0468, |
|
"grad_norm": 1.730169415473938, |
|
"learning_rate": 4.996998267226905e-05, |
|
"loss": 1.2369, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.0472, |
|
"grad_norm": 1.847095251083374, |
|
"learning_rate": 4.996946746766601e-05, |
|
"loss": 1.2597, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.0476, |
|
"grad_norm": 1.756137490272522, |
|
"learning_rate": 4.9968947881929414e-05, |
|
"loss": 1.1965, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.890035629272461, |
|
"learning_rate": 4.996842391515044e-05, |
|
"loss": 1.2345, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0484, |
|
"grad_norm": 1.7195771932601929, |
|
"learning_rate": 4.996789556742101e-05, |
|
"loss": 1.248, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.0488, |
|
"grad_norm": 1.7649211883544922, |
|
"learning_rate": 4.996736283883382e-05, |
|
"loss": 1.2504, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.0492, |
|
"grad_norm": 1.7328550815582275, |
|
"learning_rate": 4.9966825729482364e-05, |
|
"loss": 1.3122, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.0496, |
|
"grad_norm": 1.6901662349700928, |
|
"learning_rate": 4.996628423946087e-05, |
|
"loss": 1.2508, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7033767700195312, |
|
"learning_rate": 4.996573836886435e-05, |
|
"loss": 1.2145, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.0504, |
|
"grad_norm": 1.4727296829223633, |
|
"learning_rate": 4.996518811778858e-05, |
|
"loss": 1.2779, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0508, |
|
"grad_norm": 1.6590018272399902, |
|
"learning_rate": 4.9964633486330116e-05, |
|
"loss": 1.2722, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 2.6924021244049072, |
|
"learning_rate": 4.996407447458626e-05, |
|
"loss": 1.2182, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.0516, |
|
"grad_norm": 1.8809802532196045, |
|
"learning_rate": 4.99635110826551e-05, |
|
"loss": 1.2368, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.052, |
|
"grad_norm": 1.8109803199768066, |
|
"learning_rate": 4.99629433106355e-05, |
|
"loss": 1.2447, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.0524, |
|
"grad_norm": 1.6945043802261353, |
|
"learning_rate": 4.996237115862706e-05, |
|
"loss": 1.1895, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.0528, |
|
"grad_norm": 1.5174620151519775, |
|
"learning_rate": 4.99617946267302e-05, |
|
"loss": 1.2276, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.0532, |
|
"grad_norm": 1.7584028244018555, |
|
"learning_rate": 4.9961213715046045e-05, |
|
"loss": 1.2308, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.0536, |
|
"grad_norm": 2.0159788131713867, |
|
"learning_rate": 4.996062842367654e-05, |
|
"loss": 1.2044, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.054, |
|
"grad_norm": 1.7064976692199707, |
|
"learning_rate": 4.996003875272438e-05, |
|
"loss": 1.2071, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 2.149348258972168, |
|
"learning_rate": 4.995944470229302e-05, |
|
"loss": 1.2218, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.0548, |
|
"grad_norm": 1.5453240871429443, |
|
"learning_rate": 4.9958846272486704e-05, |
|
"loss": 1.2287, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.0552, |
|
"grad_norm": 1.5038634538650513, |
|
"learning_rate": 4.9958243463410414e-05, |
|
"loss": 1.2153, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.0556, |
|
"grad_norm": 1.830925464630127, |
|
"learning_rate": 4.995763627516994e-05, |
|
"loss": 1.2849, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 1.650992512702942, |
|
"learning_rate": 4.9957024707871806e-05, |
|
"loss": 1.1965, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0564, |
|
"grad_norm": 1.9642475843429565, |
|
"learning_rate": 4.995640876162332e-05, |
|
"loss": 1.2489, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.0568, |
|
"grad_norm": 1.6701452732086182, |
|
"learning_rate": 4.9955788436532545e-05, |
|
"loss": 1.1833, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.0572, |
|
"grad_norm": 1.8591731786727905, |
|
"learning_rate": 4.9955163732708346e-05, |
|
"loss": 1.2699, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 1.772956371307373, |
|
"learning_rate": 4.995453465026032e-05, |
|
"loss": 1.2582, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.058, |
|
"grad_norm": 1.8155946731567383, |
|
"learning_rate": 4.9953901189298845e-05, |
|
"loss": 1.2256, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.0584, |
|
"grad_norm": 1.7217077016830444, |
|
"learning_rate": 4.9953263349935074e-05, |
|
"loss": 1.1909, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.0588, |
|
"grad_norm": 1.8732049465179443, |
|
"learning_rate": 4.995262113228091e-05, |
|
"loss": 1.2023, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.0592, |
|
"grad_norm": 1.96453857421875, |
|
"learning_rate": 4.995197453644905e-05, |
|
"loss": 1.2066, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.0596, |
|
"grad_norm": 1.7250394821166992, |
|
"learning_rate": 4.9951323562552934e-05, |
|
"loss": 1.2038, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5440521240234375, |
|
"learning_rate": 4.995066821070679e-05, |
|
"loss": 1.2509, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0604, |
|
"grad_norm": 1.8293278217315674, |
|
"learning_rate": 4.99500084810256e-05, |
|
"loss": 1.2952, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 1.5876418352127075, |
|
"learning_rate": 4.994934437362513e-05, |
|
"loss": 1.1556, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.0612, |
|
"grad_norm": 1.4117156267166138, |
|
"learning_rate": 4.994867588862189e-05, |
|
"loss": 1.189, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.0616, |
|
"grad_norm": 1.6715811491012573, |
|
"learning_rate": 4.994800302613318e-05, |
|
"loss": 1.1876, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.062, |
|
"grad_norm": 1.551499366760254, |
|
"learning_rate": 4.994732578627706e-05, |
|
"loss": 1.2369, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.0624, |
|
"grad_norm": 1.9798293113708496, |
|
"learning_rate": 4.9946644169172355e-05, |
|
"loss": 1.214, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.0628, |
|
"grad_norm": 1.97709321975708, |
|
"learning_rate": 4.994595817493867e-05, |
|
"loss": 1.2118, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.0632, |
|
"grad_norm": 1.8808115720748901, |
|
"learning_rate": 4.9945267803696364e-05, |
|
"loss": 1.223, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.0636, |
|
"grad_norm": 2.002140998840332, |
|
"learning_rate": 4.9944573055566556e-05, |
|
"loss": 1.1949, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.5803587436676025, |
|
"learning_rate": 4.994387393067117e-05, |
|
"loss": 1.2275, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0644, |
|
"grad_norm": 1.8434706926345825, |
|
"learning_rate": 4.9943170429132855e-05, |
|
"loss": 1.1766, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.0648, |
|
"grad_norm": 1.7833911180496216, |
|
"learning_rate": 4.9942462551075056e-05, |
|
"loss": 1.2238, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0652, |
|
"grad_norm": 1.8478978872299194, |
|
"learning_rate": 4.994175029662198e-05, |
|
"loss": 1.2024, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.0656, |
|
"grad_norm": 1.597944974899292, |
|
"learning_rate": 4.994103366589859e-05, |
|
"loss": 1.261, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.066, |
|
"grad_norm": 1.515336275100708, |
|
"learning_rate": 4.994031265903063e-05, |
|
"loss": 1.2248, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.0664, |
|
"grad_norm": 1.7543236017227173, |
|
"learning_rate": 4.9939587276144616e-05, |
|
"loss": 1.216, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.0668, |
|
"grad_norm": 1.6050115823745728, |
|
"learning_rate": 4.993885751736781e-05, |
|
"loss": 1.1822, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 1.7370154857635498, |
|
"learning_rate": 4.993812338282826e-05, |
|
"loss": 1.278, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.0676, |
|
"grad_norm": 1.7703272104263306, |
|
"learning_rate": 4.993738487265478e-05, |
|
"loss": 1.2005, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.068, |
|
"grad_norm": 1.6980682611465454, |
|
"learning_rate": 4.993664198697694e-05, |
|
"loss": 1.1899, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.0684, |
|
"grad_norm": 1.7821124792099, |
|
"learning_rate": 4.99358947259251e-05, |
|
"loss": 1.219, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.0688, |
|
"grad_norm": 1.5321515798568726, |
|
"learning_rate": 4.993514308963036e-05, |
|
"loss": 1.2443, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.0692, |
|
"grad_norm": 1.5153166055679321, |
|
"learning_rate": 4.993438707822462e-05, |
|
"loss": 1.2262, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.0696, |
|
"grad_norm": 1.7875345945358276, |
|
"learning_rate": 4.993362669184051e-05, |
|
"loss": 1.2437, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6037136316299438, |
|
"learning_rate": 4.9932861930611454e-05, |
|
"loss": 1.2757, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 1.4552197456359863, |
|
"learning_rate": 4.993209279467164e-05, |
|
"loss": 1.1969, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.0708, |
|
"grad_norm": 1.4519599676132202, |
|
"learning_rate": 4.993131928415602e-05, |
|
"loss": 1.2608, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.0712, |
|
"grad_norm": 1.5513790845870972, |
|
"learning_rate": 4.993054139920032e-05, |
|
"loss": 1.2101, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.0716, |
|
"grad_norm": 1.6675373315811157, |
|
"learning_rate": 4.9929759139941e-05, |
|
"loss": 1.2782, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 1.501469373703003, |
|
"learning_rate": 4.992897250651535e-05, |
|
"loss": 1.1617, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0724, |
|
"grad_norm": 1.6184465885162354, |
|
"learning_rate": 4.992818149906138e-05, |
|
"loss": 1.2491, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.0728, |
|
"grad_norm": 1.8921860456466675, |
|
"learning_rate": 4.992738611771787e-05, |
|
"loss": 1.2323, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.0732, |
|
"grad_norm": 1.698572039604187, |
|
"learning_rate": 4.992658636262438e-05, |
|
"loss": 1.2315, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 1.8895962238311768, |
|
"learning_rate": 4.992578223392124e-05, |
|
"loss": 1.2285, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.074, |
|
"grad_norm": 1.4930404424667358, |
|
"learning_rate": 4.992497373174955e-05, |
|
"loss": 1.2123, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.0744, |
|
"grad_norm": 1.791544795036316, |
|
"learning_rate": 4.992416085625115e-05, |
|
"loss": 1.2318, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.0748, |
|
"grad_norm": 1.7196767330169678, |
|
"learning_rate": 4.9923343607568684e-05, |
|
"loss": 1.1746, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.0752, |
|
"grad_norm": 1.732494592666626, |
|
"learning_rate": 4.992252198584554e-05, |
|
"loss": 1.2146, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.0756, |
|
"grad_norm": 1.6859017610549927, |
|
"learning_rate": 4.992169599122587e-05, |
|
"loss": 1.2061, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.076, |
|
"grad_norm": 1.8825091123580933, |
|
"learning_rate": 4.9920865623854615e-05, |
|
"loss": 1.1909, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0764, |
|
"grad_norm": 1.9849854707717896, |
|
"learning_rate": 4.9920030883877476e-05, |
|
"loss": 1.2249, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 2.0635294914245605, |
|
"learning_rate": 4.9919191771440905e-05, |
|
"loss": 1.2154, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.0772, |
|
"grad_norm": 1.7828255891799927, |
|
"learning_rate": 4.991834828669213e-05, |
|
"loss": 1.2488, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.0776, |
|
"grad_norm": 1.5887553691864014, |
|
"learning_rate": 4.991750042977916e-05, |
|
"loss": 1.1906, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.078, |
|
"grad_norm": 1.6549077033996582, |
|
"learning_rate": 4.991664820085074e-05, |
|
"loss": 1.2324, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.0784, |
|
"grad_norm": 1.843769907951355, |
|
"learning_rate": 4.991579160005644e-05, |
|
"loss": 1.2576, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.0788, |
|
"grad_norm": 1.4578025341033936, |
|
"learning_rate": 4.991493062754651e-05, |
|
"loss": 1.2846, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.0792, |
|
"grad_norm": 1.8506391048431396, |
|
"learning_rate": 4.991406528347206e-05, |
|
"loss": 1.2192, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.0796, |
|
"grad_norm": 1.5859005451202393, |
|
"learning_rate": 4.991319556798488e-05, |
|
"loss": 1.2622, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7537639141082764, |
|
"learning_rate": 4.991232148123761e-05, |
|
"loss": 1.2302, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0804, |
|
"grad_norm": 1.6842198371887207, |
|
"learning_rate": 4.99114430233836e-05, |
|
"loss": 1.2561, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.0808, |
|
"grad_norm": 1.5184071063995361, |
|
"learning_rate": 4.991056019457697e-05, |
|
"loss": 1.223, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.0812, |
|
"grad_norm": 1.5156010389328003, |
|
"learning_rate": 4.990967299497264e-05, |
|
"loss": 1.2002, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.0816, |
|
"grad_norm": 1.7296911478042603, |
|
"learning_rate": 4.990878142472628e-05, |
|
"loss": 1.2095, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.082, |
|
"grad_norm": 1.5455466508865356, |
|
"learning_rate": 4.990788548399432e-05, |
|
"loss": 1.2368, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.0824, |
|
"grad_norm": 1.8350162506103516, |
|
"learning_rate": 4.990698517293395e-05, |
|
"loss": 1.2379, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.0828, |
|
"grad_norm": 1.8894885778427124, |
|
"learning_rate": 4.9906080491703146e-05, |
|
"loss": 1.1947, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 1.892716646194458, |
|
"learning_rate": 4.990517144046064e-05, |
|
"loss": 1.2111, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.0836, |
|
"grad_norm": 1.657734751701355, |
|
"learning_rate": 4.990425801936594e-05, |
|
"loss": 1.2507, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.084, |
|
"grad_norm": 1.800045132637024, |
|
"learning_rate": 4.990334022857932e-05, |
|
"loss": 1.2333, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.0844, |
|
"grad_norm": 1.6811712980270386, |
|
"learning_rate": 4.990241806826179e-05, |
|
"loss": 1.2072, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.0848, |
|
"grad_norm": 1.667485237121582, |
|
"learning_rate": 4.9901491538575185e-05, |
|
"loss": 1.1987, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.0852, |
|
"grad_norm": 1.4977103471755981, |
|
"learning_rate": 4.9900560639682045e-05, |
|
"loss": 1.2305, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.0856, |
|
"grad_norm": 1.898008942604065, |
|
"learning_rate": 4.9899625371745726e-05, |
|
"loss": 1.1646, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.086, |
|
"grad_norm": 1.67731511592865, |
|
"learning_rate": 4.989868573493032e-05, |
|
"loss": 1.1999, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 1.7164236307144165, |
|
"learning_rate": 4.9897741729400705e-05, |
|
"loss": 1.2169, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.0868, |
|
"grad_norm": 1.7378337383270264, |
|
"learning_rate": 4.9896793355322495e-05, |
|
"loss": 1.2711, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.0872, |
|
"grad_norm": 1.4327822923660278, |
|
"learning_rate": 4.989584061286211e-05, |
|
"loss": 1.1496, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.0876, |
|
"grad_norm": 1.9170866012573242, |
|
"learning_rate": 4.989488350218671e-05, |
|
"loss": 1.2445, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 1.6763935089111328, |
|
"learning_rate": 4.9893922023464236e-05, |
|
"loss": 1.1969, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0884, |
|
"grad_norm": 1.7465934753417969, |
|
"learning_rate": 4.989295617686337e-05, |
|
"loss": 1.1862, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.0888, |
|
"grad_norm": 1.5775673389434814, |
|
"learning_rate": 4.9891985962553606e-05, |
|
"loss": 1.2265, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.0892, |
|
"grad_norm": 2.123936176300049, |
|
"learning_rate": 4.989101138070516e-05, |
|
"loss": 1.2077, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 1.6794697046279907, |
|
"learning_rate": 4.989003243148904e-05, |
|
"loss": 1.1972, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.656134843826294, |
|
"learning_rate": 4.9889049115077005e-05, |
|
"loss": 1.2433, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.0904, |
|
"grad_norm": 1.6735204458236694, |
|
"learning_rate": 4.988806143164159e-05, |
|
"loss": 1.2143, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.0908, |
|
"grad_norm": 1.779127597808838, |
|
"learning_rate": 4.9887069381356094e-05, |
|
"loss": 1.2113, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.0912, |
|
"grad_norm": 1.6463018655776978, |
|
"learning_rate": 4.988607296439458e-05, |
|
"loss": 1.1745, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.0916, |
|
"grad_norm": 1.8336529731750488, |
|
"learning_rate": 4.988507218093189e-05, |
|
"loss": 1.2708, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.092, |
|
"grad_norm": 1.718851089477539, |
|
"learning_rate": 4.98840670311436e-05, |
|
"loss": 1.2398, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.0924, |
|
"grad_norm": 1.5948344469070435, |
|
"learning_rate": 4.988305751520609e-05, |
|
"loss": 1.2493, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 1.7609024047851562, |
|
"learning_rate": 4.988204363329648e-05, |
|
"loss": 1.2151, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.0932, |
|
"grad_norm": 1.860521674156189, |
|
"learning_rate": 4.988102538559268e-05, |
|
"loss": 1.2635, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.0936, |
|
"grad_norm": 1.8233569860458374, |
|
"learning_rate": 4.988000277227334e-05, |
|
"loss": 1.213, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.094, |
|
"grad_norm": 1.875961184501648, |
|
"learning_rate": 4.987897579351788e-05, |
|
"loss": 1.249, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.0944, |
|
"grad_norm": 1.5737124681472778, |
|
"learning_rate": 4.987794444950651e-05, |
|
"loss": 1.245, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.0948, |
|
"grad_norm": 1.6306177377700806, |
|
"learning_rate": 4.9876908740420175e-05, |
|
"loss": 1.1596, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.0952, |
|
"grad_norm": 1.912415862083435, |
|
"learning_rate": 4.9875868666440604e-05, |
|
"loss": 1.1932, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.0956, |
|
"grad_norm": 1.7863068580627441, |
|
"learning_rate": 4.9874824227750305e-05, |
|
"loss": 1.166, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.4861044883728027, |
|
"learning_rate": 4.987377542453251e-05, |
|
"loss": 1.2109, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.0964, |
|
"grad_norm": 1.8580693006515503, |
|
"learning_rate": 4.987272225697125e-05, |
|
"loss": 1.2005, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.0968, |
|
"grad_norm": 1.5345180034637451, |
|
"learning_rate": 4.9871664725251314e-05, |
|
"loss": 1.2178, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.0972, |
|
"grad_norm": 1.6024930477142334, |
|
"learning_rate": 4.987060282955826e-05, |
|
"loss": 1.1837, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.0976, |
|
"grad_norm": 1.8284636735916138, |
|
"learning_rate": 4.986953657007841e-05, |
|
"loss": 1.2051, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.098, |
|
"grad_norm": 1.7044328451156616, |
|
"learning_rate": 4.986846594699883e-05, |
|
"loss": 1.1946, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.0984, |
|
"grad_norm": 1.8590692281723022, |
|
"learning_rate": 4.98673909605074e-05, |
|
"loss": 1.215, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.0988, |
|
"grad_norm": 1.660416603088379, |
|
"learning_rate": 4.986631161079272e-05, |
|
"loss": 1.2124, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 1.5783928632736206, |
|
"learning_rate": 4.986522789804417e-05, |
|
"loss": 1.2048, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.0996, |
|
"grad_norm": 1.6372880935668945, |
|
"learning_rate": 4.9864139822451905e-05, |
|
"loss": 1.1847, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.6335861682891846, |
|
"learning_rate": 4.9863047384206835e-05, |
|
"loss": 1.201, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.1004, |
|
"grad_norm": 1.7448561191558838, |
|
"learning_rate": 4.9861950583500636e-05, |
|
"loss": 1.2208, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.1008, |
|
"grad_norm": 1.6538008451461792, |
|
"learning_rate": 4.9860849420525766e-05, |
|
"loss": 1.3027, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.1012, |
|
"grad_norm": 1.5223474502563477, |
|
"learning_rate": 4.9859743895475416e-05, |
|
"loss": 1.21, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.1016, |
|
"grad_norm": 2.5573835372924805, |
|
"learning_rate": 4.985863400854358e-05, |
|
"loss": 1.2278, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.102, |
|
"grad_norm": 1.7527281045913696, |
|
"learning_rate": 4.9857519759924974e-05, |
|
"loss": 1.2594, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 1.5259441137313843, |
|
"learning_rate": 4.9856401149815126e-05, |
|
"loss": 1.1852, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1028, |
|
"grad_norm": 1.6664983034133911, |
|
"learning_rate": 4.985527817841029e-05, |
|
"loss": 1.2257, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.1032, |
|
"grad_norm": 1.7659004926681519, |
|
"learning_rate": 4.985415084590752e-05, |
|
"loss": 1.1816, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.1036, |
|
"grad_norm": 1.7176613807678223, |
|
"learning_rate": 4.9853019152504607e-05, |
|
"loss": 1.2328, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 1.7339608669281006, |
|
"learning_rate": 4.985188309840012e-05, |
|
"loss": 1.3235, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1044, |
|
"grad_norm": 1.5208759307861328, |
|
"learning_rate": 4.985074268379338e-05, |
|
"loss": 1.2024, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.1048, |
|
"grad_norm": 1.7345401048660278, |
|
"learning_rate": 4.98495979088845e-05, |
|
"loss": 1.2577, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.1052, |
|
"grad_norm": 1.6194533109664917, |
|
"learning_rate": 4.984844877387433e-05, |
|
"loss": 1.1717, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 1.9546769857406616, |
|
"learning_rate": 4.9847295278964514e-05, |
|
"loss": 1.2002, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.106, |
|
"grad_norm": 1.7305984497070312, |
|
"learning_rate": 4.984613742435742e-05, |
|
"loss": 1.2273, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.1064, |
|
"grad_norm": 1.9428414106369019, |
|
"learning_rate": 4.9844975210256217e-05, |
|
"loss": 1.1932, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.1068, |
|
"grad_norm": 1.6974749565124512, |
|
"learning_rate": 4.984380863686482e-05, |
|
"loss": 1.207, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.1072, |
|
"grad_norm": 1.6390514373779297, |
|
"learning_rate": 4.984263770438793e-05, |
|
"loss": 1.1626, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.1076, |
|
"grad_norm": 1.3905149698257446, |
|
"learning_rate": 4.9841462413030995e-05, |
|
"loss": 1.2294, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.108, |
|
"grad_norm": 1.6545757055282593, |
|
"learning_rate": 4.984028276300021e-05, |
|
"loss": 1.2109, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.1084, |
|
"grad_norm": 1.6141990423202515, |
|
"learning_rate": 4.983909875450258e-05, |
|
"loss": 1.2104, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 1.4683756828308105, |
|
"learning_rate": 4.9837910387745845e-05, |
|
"loss": 1.1762, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.1092, |
|
"grad_norm": 1.566309928894043, |
|
"learning_rate": 4.983671766293851e-05, |
|
"loss": 1.2136, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.1096, |
|
"grad_norm": 1.6710453033447266, |
|
"learning_rate": 4.9835520580289854e-05, |
|
"loss": 1.2212, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.7142356634140015, |
|
"learning_rate": 4.983431914000991e-05, |
|
"loss": 1.1812, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.1104, |
|
"grad_norm": 1.8193789720535278, |
|
"learning_rate": 4.98331133423095e-05, |
|
"loss": 1.2175, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.1108, |
|
"grad_norm": 1.675352931022644, |
|
"learning_rate": 4.9831903187400166e-05, |
|
"loss": 1.2099, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.1112, |
|
"grad_norm": 1.8119478225708008, |
|
"learning_rate": 4.9830688675494265e-05, |
|
"loss": 1.2313, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.1116, |
|
"grad_norm": 1.5995213985443115, |
|
"learning_rate": 4.982946980680488e-05, |
|
"loss": 1.2507, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 1.719618797302246, |
|
"learning_rate": 4.982824658154589e-05, |
|
"loss": 1.1767, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1124, |
|
"grad_norm": 1.6213531494140625, |
|
"learning_rate": 4.982701899993189e-05, |
|
"loss": 1.2055, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.1128, |
|
"grad_norm": 1.788510799407959, |
|
"learning_rate": 4.9825787062178315e-05, |
|
"loss": 1.1826, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.1132, |
|
"grad_norm": 1.8999608755111694, |
|
"learning_rate": 4.982455076850129e-05, |
|
"loss": 1.2034, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.1136, |
|
"grad_norm": 1.538346767425537, |
|
"learning_rate": 4.982331011911774e-05, |
|
"loss": 1.1955, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.114, |
|
"grad_norm": 1.760783314704895, |
|
"learning_rate": 4.982206511424534e-05, |
|
"loss": 1.2228, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.1144, |
|
"grad_norm": 1.7535641193389893, |
|
"learning_rate": 4.982081575410256e-05, |
|
"loss": 1.1827, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.1148, |
|
"grad_norm": 1.7111599445343018, |
|
"learning_rate": 4.98195620389086e-05, |
|
"loss": 1.2226, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 1.7336255311965942, |
|
"learning_rate": 4.981830396888344e-05, |
|
"loss": 1.2594, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.1156, |
|
"grad_norm": 1.9409478902816772, |
|
"learning_rate": 4.981704154424781e-05, |
|
"loss": 1.2773, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.116, |
|
"grad_norm": 1.6650772094726562, |
|
"learning_rate": 4.9815774765223226e-05, |
|
"loss": 1.1894, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.1164, |
|
"grad_norm": 1.776209831237793, |
|
"learning_rate": 4.9814503632031954e-05, |
|
"loss": 1.2409, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.1168, |
|
"grad_norm": 1.806114912033081, |
|
"learning_rate": 4.981322814489703e-05, |
|
"loss": 1.2251, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.1172, |
|
"grad_norm": 1.8370269536972046, |
|
"learning_rate": 4.9811948304042234e-05, |
|
"loss": 1.1519, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.1176, |
|
"grad_norm": 1.9353857040405273, |
|
"learning_rate": 4.981066410969215e-05, |
|
"loss": 1.2432, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.118, |
|
"grad_norm": 1.5059784650802612, |
|
"learning_rate": 4.980937556207208e-05, |
|
"loss": 1.1641, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 1.5918395519256592, |
|
"learning_rate": 4.980808266140813e-05, |
|
"loss": 1.2464, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.1188, |
|
"grad_norm": 1.6992462873458862, |
|
"learning_rate": 4.980678540792715e-05, |
|
"loss": 1.2007, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.1192, |
|
"grad_norm": 1.7134628295898438, |
|
"learning_rate": 4.980548380185674e-05, |
|
"loss": 1.1925, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.1196, |
|
"grad_norm": 1.8130481243133545, |
|
"learning_rate": 4.9804177843425295e-05, |
|
"loss": 1.1775, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.811142086982727, |
|
"learning_rate": 4.980286753286195e-05, |
|
"loss": 1.2813, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1204, |
|
"grad_norm": 1.7274268865585327, |
|
"learning_rate": 4.980155287039662e-05, |
|
"loss": 1.2213, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.1208, |
|
"grad_norm": 1.8031953573226929, |
|
"learning_rate": 4.980023385625996e-05, |
|
"loss": 1.1459, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.1212, |
|
"grad_norm": 1.6211178302764893, |
|
"learning_rate": 4.979891049068342e-05, |
|
"loss": 1.1575, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 1.4958020448684692, |
|
"learning_rate": 4.979758277389919e-05, |
|
"loss": 1.1516, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.122, |
|
"grad_norm": 1.8061717748641968, |
|
"learning_rate": 4.9796250706140224e-05, |
|
"loss": 1.2135, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.1224, |
|
"grad_norm": 1.4984862804412842, |
|
"learning_rate": 4.979491428764026e-05, |
|
"loss": 1.2702, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.1228, |
|
"grad_norm": 1.9237560033798218, |
|
"learning_rate": 4.979357351863377e-05, |
|
"loss": 1.2653, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.1232, |
|
"grad_norm": 1.7026283740997314, |
|
"learning_rate": 4.979222839935602e-05, |
|
"loss": 1.2467, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.1236, |
|
"grad_norm": 1.523640751838684, |
|
"learning_rate": 4.979087893004302e-05, |
|
"loss": 1.2177, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.124, |
|
"grad_norm": 1.5386415719985962, |
|
"learning_rate": 4.9789525110931545e-05, |
|
"loss": 1.1528, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.1244, |
|
"grad_norm": 1.6132177114486694, |
|
"learning_rate": 4.9788166942259135e-05, |
|
"loss": 1.1925, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 1.8370720148086548, |
|
"learning_rate": 4.9786804424264085e-05, |
|
"loss": 1.1702, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.1252, |
|
"grad_norm": 1.78260338306427, |
|
"learning_rate": 4.978543755718549e-05, |
|
"loss": 1.2394, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.1256, |
|
"grad_norm": 1.3837835788726807, |
|
"learning_rate": 4.978406634126315e-05, |
|
"loss": 1.2256, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.126, |
|
"grad_norm": 1.776090383529663, |
|
"learning_rate": 4.978269077673767e-05, |
|
"loss": 1.2859, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.1264, |
|
"grad_norm": 1.423762321472168, |
|
"learning_rate": 4.9781310863850405e-05, |
|
"loss": 1.2486, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.1268, |
|
"grad_norm": 1.5128095149993896, |
|
"learning_rate": 4.977992660284347e-05, |
|
"loss": 1.188, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.1272, |
|
"grad_norm": 1.7844202518463135, |
|
"learning_rate": 4.977853799395976e-05, |
|
"loss": 1.1579, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.1276, |
|
"grad_norm": 1.7243590354919434, |
|
"learning_rate": 4.9777145037442906e-05, |
|
"loss": 1.2631, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 1.7235385179519653, |
|
"learning_rate": 4.977574773353732e-05, |
|
"loss": 1.144, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1284, |
|
"grad_norm": 1.522979974746704, |
|
"learning_rate": 4.9774346082488176e-05, |
|
"loss": 1.2471, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.1288, |
|
"grad_norm": 1.3776745796203613, |
|
"learning_rate": 4.9772940084541405e-05, |
|
"loss": 1.222, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.1292, |
|
"grad_norm": 1.7246540784835815, |
|
"learning_rate": 4.97715297399437e-05, |
|
"loss": 1.1857, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.1296, |
|
"grad_norm": 1.747706413269043, |
|
"learning_rate": 4.977011504894252e-05, |
|
"loss": 1.1973, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.6644729375839233, |
|
"learning_rate": 4.976869601178609e-05, |
|
"loss": 1.1559, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.1304, |
|
"grad_norm": 1.5264381170272827, |
|
"learning_rate": 4.9767272628723396e-05, |
|
"loss": 1.256, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.1308, |
|
"grad_norm": 1.5906404256820679, |
|
"learning_rate": 4.9765844900004176e-05, |
|
"loss": 1.1486, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 1.4463717937469482, |
|
"learning_rate": 4.9764412825878943e-05, |
|
"loss": 1.2871, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.1316, |
|
"grad_norm": 1.727552056312561, |
|
"learning_rate": 4.976297640659897e-05, |
|
"loss": 1.178, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.132, |
|
"grad_norm": 1.5339679718017578, |
|
"learning_rate": 4.976153564241628e-05, |
|
"loss": 1.2556, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.1324, |
|
"grad_norm": 1.5562660694122314, |
|
"learning_rate": 4.9760090533583686e-05, |
|
"loss": 1.1758, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.1328, |
|
"grad_norm": 1.6301156282424927, |
|
"learning_rate": 4.975864108035474e-05, |
|
"loss": 1.2246, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.1332, |
|
"grad_norm": 1.9611403942108154, |
|
"learning_rate": 4.975718728298375e-05, |
|
"loss": 1.2752, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.1336, |
|
"grad_norm": 1.4898903369903564, |
|
"learning_rate": 4.975572914172582e-05, |
|
"loss": 1.1814, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.134, |
|
"grad_norm": 1.7393544912338257, |
|
"learning_rate": 4.975426665683678e-05, |
|
"loss": 1.207, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 1.5046724081039429, |
|
"learning_rate": 4.975279982857324e-05, |
|
"loss": 1.2219, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.1348, |
|
"grad_norm": 1.8701568841934204, |
|
"learning_rate": 4.9751328657192565e-05, |
|
"loss": 1.2013, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.1352, |
|
"grad_norm": 1.7540148496627808, |
|
"learning_rate": 4.97498531429529e-05, |
|
"loss": 1.1784, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.1356, |
|
"grad_norm": 2.0447638034820557, |
|
"learning_rate": 4.974837328611312e-05, |
|
"loss": 1.2023, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 1.6013314723968506, |
|
"learning_rate": 4.9746889086932895e-05, |
|
"loss": 1.2658, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1364, |
|
"grad_norm": 1.83162522315979, |
|
"learning_rate": 4.974540054567264e-05, |
|
"loss": 1.2172, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.1368, |
|
"grad_norm": 1.6083216667175293, |
|
"learning_rate": 4.9743907662593524e-05, |
|
"loss": 1.2236, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.1372, |
|
"grad_norm": 1.5732282400131226, |
|
"learning_rate": 4.97424104379575e-05, |
|
"loss": 1.1822, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 1.6815297603607178, |
|
"learning_rate": 4.974090887202726e-05, |
|
"loss": 1.2386, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.138, |
|
"grad_norm": 1.5495154857635498, |
|
"learning_rate": 4.9739402965066276e-05, |
|
"loss": 1.1664, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.1384, |
|
"grad_norm": 1.4523470401763916, |
|
"learning_rate": 4.9737892717338774e-05, |
|
"loss": 1.1526, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.1388, |
|
"grad_norm": 1.6146246194839478, |
|
"learning_rate": 4.973637812910973e-05, |
|
"loss": 1.1802, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.1392, |
|
"grad_norm": 1.5565937757492065, |
|
"learning_rate": 4.9734859200644905e-05, |
|
"loss": 1.2272, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.1396, |
|
"grad_norm": 1.7188202142715454, |
|
"learning_rate": 4.9733335932210814e-05, |
|
"loss": 1.3008, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.7676734924316406, |
|
"learning_rate": 4.9731808324074717e-05, |
|
"loss": 1.2122, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.1404, |
|
"grad_norm": 1.3866500854492188, |
|
"learning_rate": 4.973027637650464e-05, |
|
"loss": 1.2475, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 1.736222267150879, |
|
"learning_rate": 4.97287400897694e-05, |
|
"loss": 1.1491, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.1412, |
|
"grad_norm": 1.7289378643035889, |
|
"learning_rate": 4.972719946413854e-05, |
|
"loss": 1.2638, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.1416, |
|
"grad_norm": 1.5107754468917847, |
|
"learning_rate": 4.972565449988239e-05, |
|
"loss": 1.2414, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.142, |
|
"grad_norm": 1.6020653247833252, |
|
"learning_rate": 4.972410519727201e-05, |
|
"loss": 1.154, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.1424, |
|
"grad_norm": 1.57801353931427, |
|
"learning_rate": 4.972255155657925e-05, |
|
"loss": 1.2212, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.1428, |
|
"grad_norm": 1.7181590795516968, |
|
"learning_rate": 4.972099357807671e-05, |
|
"loss": 1.2248, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.1432, |
|
"grad_norm": 1.356872320175171, |
|
"learning_rate": 4.9719431262037755e-05, |
|
"loss": 1.0836, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.1436, |
|
"grad_norm": 1.4767268896102905, |
|
"learning_rate": 4.9717864608736506e-05, |
|
"loss": 1.2183, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.496610164642334, |
|
"learning_rate": 4.971629361844785e-05, |
|
"loss": 1.2099, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.1444, |
|
"grad_norm": 1.6755746603012085, |
|
"learning_rate": 4.971471829144743e-05, |
|
"loss": 1.1769, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.1448, |
|
"grad_norm": 1.4553226232528687, |
|
"learning_rate": 4.9713138628011654e-05, |
|
"loss": 1.2173, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.1452, |
|
"grad_norm": 1.7685695886611938, |
|
"learning_rate": 4.971155462841769e-05, |
|
"loss": 1.1878, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.1456, |
|
"grad_norm": 1.6360673904418945, |
|
"learning_rate": 4.9709966292943455e-05, |
|
"loss": 1.1759, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.146, |
|
"grad_norm": 1.7544503211975098, |
|
"learning_rate": 4.9708373621867656e-05, |
|
"loss": 1.1912, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.1464, |
|
"grad_norm": 1.6585785150527954, |
|
"learning_rate": 4.9706776615469716e-05, |
|
"loss": 1.2538, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.1468, |
|
"grad_norm": 1.6937992572784424, |
|
"learning_rate": 4.970517527402988e-05, |
|
"loss": 1.1707, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 1.5035377740859985, |
|
"learning_rate": 4.970356959782909e-05, |
|
"loss": 1.0796, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.1476, |
|
"grad_norm": 1.5920889377593994, |
|
"learning_rate": 4.970195958714909e-05, |
|
"loss": 1.2042, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.148, |
|
"grad_norm": 1.7095478773117065, |
|
"learning_rate": 4.970034524227238e-05, |
|
"loss": 1.2424, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.1484, |
|
"grad_norm": 1.565622091293335, |
|
"learning_rate": 4.96987265634822e-05, |
|
"loss": 1.2442, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.1488, |
|
"grad_norm": 1.5227761268615723, |
|
"learning_rate": 4.9697103551062556e-05, |
|
"loss": 1.2286, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.1492, |
|
"grad_norm": 1.8364589214324951, |
|
"learning_rate": 4.9695476205298235e-05, |
|
"loss": 1.2409, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.1496, |
|
"grad_norm": 1.476838231086731, |
|
"learning_rate": 4.969384452647477e-05, |
|
"loss": 1.2491, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.617569088935852, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 1.1515, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 1.623694658279419, |
|
"learning_rate": 4.969056817079633e-05, |
|
"loss": 1.2829, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.1508, |
|
"grad_norm": 1.8498085737228394, |
|
"learning_rate": 4.968892349451621e-05, |
|
"loss": 1.2083, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.1512, |
|
"grad_norm": 1.544599175453186, |
|
"learning_rate": 4.968727448632669e-05, |
|
"loss": 1.1898, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.1516, |
|
"grad_norm": 1.7508872747421265, |
|
"learning_rate": 4.968562114651709e-05, |
|
"loss": 1.2359, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 1.6149662733078003, |
|
"learning_rate": 4.968396347537751e-05, |
|
"loss": 1.2036, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1524, |
|
"grad_norm": 1.5705629587173462, |
|
"learning_rate": 4.96823014731988e-05, |
|
"loss": 1.2044, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.1528, |
|
"grad_norm": 1.7372088432312012, |
|
"learning_rate": 4.9680635140272575e-05, |
|
"loss": 1.3014, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.1532, |
|
"grad_norm": 1.4629701375961304, |
|
"learning_rate": 4.967896447689121e-05, |
|
"loss": 1.2249, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 1.6758002042770386, |
|
"learning_rate": 4.967728948334784e-05, |
|
"loss": 1.2519, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.154, |
|
"grad_norm": 1.6669517755508423, |
|
"learning_rate": 4.967561015993635e-05, |
|
"loss": 1.2515, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.1544, |
|
"grad_norm": 1.556141972541809, |
|
"learning_rate": 4.9673926506951404e-05, |
|
"loss": 1.1648, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.1548, |
|
"grad_norm": 1.5720252990722656, |
|
"learning_rate": 4.967223852468842e-05, |
|
"loss": 1.1668, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.1552, |
|
"grad_norm": 1.5914875268936157, |
|
"learning_rate": 4.967054621344356e-05, |
|
"loss": 1.2718, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.1556, |
|
"grad_norm": 1.8460205793380737, |
|
"learning_rate": 4.966884957351375e-05, |
|
"loss": 1.215, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.156, |
|
"grad_norm": 1.4069819450378418, |
|
"learning_rate": 4.96671486051967e-05, |
|
"loss": 1.2576, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.1564, |
|
"grad_norm": 1.8129643201828003, |
|
"learning_rate": 4.966544330879085e-05, |
|
"loss": 1.1778, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 1.6528633832931519, |
|
"learning_rate": 4.966373368459541e-05, |
|
"loss": 1.1827, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.1572, |
|
"grad_norm": 1.5496547222137451, |
|
"learning_rate": 4.966201973291036e-05, |
|
"loss": 1.16, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.1576, |
|
"grad_norm": 1.7578569650650024, |
|
"learning_rate": 4.966030145403642e-05, |
|
"loss": 1.2164, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.158, |
|
"grad_norm": 1.6833785772323608, |
|
"learning_rate": 4.9658578848275076e-05, |
|
"loss": 1.2729, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.1584, |
|
"grad_norm": 1.4320454597473145, |
|
"learning_rate": 4.965685191592859e-05, |
|
"loss": 1.2141, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.1588, |
|
"grad_norm": 1.5332552194595337, |
|
"learning_rate": 4.9655120657299945e-05, |
|
"loss": 1.2568, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.1592, |
|
"grad_norm": 1.4154534339904785, |
|
"learning_rate": 4.965338507269294e-05, |
|
"loss": 1.2167, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.1596, |
|
"grad_norm": 1.6980712413787842, |
|
"learning_rate": 4.965164516241206e-05, |
|
"loss": 1.178, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.452364444732666, |
|
"learning_rate": 4.964990092676263e-05, |
|
"loss": 1.2442, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1604, |
|
"grad_norm": 1.5581384897232056, |
|
"learning_rate": 4.964815236605066e-05, |
|
"loss": 1.1763, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.1608, |
|
"grad_norm": 1.5522022247314453, |
|
"learning_rate": 4.964639948058297e-05, |
|
"loss": 1.2057, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.1612, |
|
"grad_norm": 1.8759955167770386, |
|
"learning_rate": 4.964464227066712e-05, |
|
"loss": 1.2072, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.1616, |
|
"grad_norm": 1.6909701824188232, |
|
"learning_rate": 4.964288073661142e-05, |
|
"loss": 1.1904, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.162, |
|
"grad_norm": 1.7072064876556396, |
|
"learning_rate": 4.9641114878724956e-05, |
|
"loss": 1.255, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.1624, |
|
"grad_norm": 1.663236379623413, |
|
"learning_rate": 4.963934469731756e-05, |
|
"loss": 1.2245, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.1628, |
|
"grad_norm": 1.6193203926086426, |
|
"learning_rate": 4.963757019269983e-05, |
|
"loss": 1.2019, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 3.217263698577881, |
|
"learning_rate": 4.963579136518312e-05, |
|
"loss": 1.1895, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.1636, |
|
"grad_norm": 1.704827904701233, |
|
"learning_rate": 4.963400821507954e-05, |
|
"loss": 1.1823, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.164, |
|
"grad_norm": 1.611045002937317, |
|
"learning_rate": 4.9632220742701965e-05, |
|
"loss": 1.2057, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.1644, |
|
"grad_norm": 1.6475915908813477, |
|
"learning_rate": 4.963042894836403e-05, |
|
"loss": 1.2055, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.1648, |
|
"grad_norm": 1.3033784627914429, |
|
"learning_rate": 4.96286328323801e-05, |
|
"loss": 1.3009, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.1652, |
|
"grad_norm": 1.928715705871582, |
|
"learning_rate": 4.962683239506534e-05, |
|
"loss": 1.1779, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.1656, |
|
"grad_norm": 1.7251336574554443, |
|
"learning_rate": 4.962502763673565e-05, |
|
"loss": 1.2536, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.166, |
|
"grad_norm": 1.4346777200698853, |
|
"learning_rate": 4.9623218557707694e-05, |
|
"loss": 1.2126, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 1.4237911701202393, |
|
"learning_rate": 4.96214051582989e-05, |
|
"loss": 1.1766, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.1668, |
|
"grad_norm": 2.051907539367676, |
|
"learning_rate": 4.961958743882742e-05, |
|
"loss": 1.2302, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.1672, |
|
"grad_norm": 1.6428817510604858, |
|
"learning_rate": 4.961776539961222e-05, |
|
"loss": 1.164, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.1676, |
|
"grad_norm": 1.492691993713379, |
|
"learning_rate": 4.961593904097297e-05, |
|
"loss": 1.2141, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 1.70375394821167, |
|
"learning_rate": 4.9614108363230135e-05, |
|
"loss": 1.2091, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.1684, |
|
"grad_norm": 1.7503658533096313, |
|
"learning_rate": 4.961227336670493e-05, |
|
"loss": 1.1867, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.1688, |
|
"grad_norm": 1.8217905759811401, |
|
"learning_rate": 4.961043405171931e-05, |
|
"loss": 1.1937, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.1692, |
|
"grad_norm": 1.8402591943740845, |
|
"learning_rate": 4.9608590418596016e-05, |
|
"loss": 1.1849, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 1.6328750848770142, |
|
"learning_rate": 4.960674246765851e-05, |
|
"loss": 1.2358, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.8972320556640625, |
|
"learning_rate": 4.960489019923105e-05, |
|
"loss": 1.1965, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.1704, |
|
"grad_norm": 1.7403242588043213, |
|
"learning_rate": 4.9603033613638626e-05, |
|
"loss": 1.2007, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.1708, |
|
"grad_norm": 1.811218023300171, |
|
"learning_rate": 4.9601172711207005e-05, |
|
"loss": 1.1738, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.1712, |
|
"grad_norm": 1.8417551517486572, |
|
"learning_rate": 4.959930749226269e-05, |
|
"loss": 1.2474, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.1716, |
|
"grad_norm": 1.7173854112625122, |
|
"learning_rate": 4.9597437957132955e-05, |
|
"loss": 1.1534, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.172, |
|
"grad_norm": 1.5625674724578857, |
|
"learning_rate": 4.959556410614582e-05, |
|
"loss": 1.1943, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.1724, |
|
"grad_norm": 1.8985899686813354, |
|
"learning_rate": 4.959368593963007e-05, |
|
"loss": 1.1962, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 1.7198944091796875, |
|
"learning_rate": 4.959180345791528e-05, |
|
"loss": 1.2023, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.1732, |
|
"grad_norm": 1.4986478090286255, |
|
"learning_rate": 4.95899166613317e-05, |
|
"loss": 1.2129, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.1736, |
|
"grad_norm": 1.4849358797073364, |
|
"learning_rate": 4.958802555021042e-05, |
|
"loss": 1.1964, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.174, |
|
"grad_norm": 1.5313656330108643, |
|
"learning_rate": 4.958613012488324e-05, |
|
"loss": 1.227, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.1744, |
|
"grad_norm": 1.6085180044174194, |
|
"learning_rate": 4.958423038568274e-05, |
|
"loss": 1.1662, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.1748, |
|
"grad_norm": 1.8227916955947876, |
|
"learning_rate": 4.9582326332942244e-05, |
|
"loss": 1.2102, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.1752, |
|
"grad_norm": 1.7549911737442017, |
|
"learning_rate": 4.958041796699583e-05, |
|
"loss": 1.1915, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.1756, |
|
"grad_norm": 2.0257728099823, |
|
"learning_rate": 4.957850528817834e-05, |
|
"loss": 1.1737, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.7481350898742676, |
|
"learning_rate": 4.9576588296825386e-05, |
|
"loss": 1.2165, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1764, |
|
"grad_norm": 1.3570259809494019, |
|
"learning_rate": 4.957466699327331e-05, |
|
"loss": 1.2375, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.1768, |
|
"grad_norm": 1.782902717590332, |
|
"learning_rate": 4.957274137785922e-05, |
|
"loss": 1.2078, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.1772, |
|
"grad_norm": 1.4159862995147705, |
|
"learning_rate": 4.9570811450921e-05, |
|
"loss": 1.1476, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.1776, |
|
"grad_norm": 1.5632498264312744, |
|
"learning_rate": 4.956887721279726e-05, |
|
"loss": 1.2646, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.178, |
|
"grad_norm": 1.5948270559310913, |
|
"learning_rate": 4.9566938663827377e-05, |
|
"loss": 1.1947, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.1784, |
|
"grad_norm": 1.6205294132232666, |
|
"learning_rate": 4.95649958043515e-05, |
|
"loss": 1.1819, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.1788, |
|
"grad_norm": 1.8377573490142822, |
|
"learning_rate": 4.9563048634710516e-05, |
|
"loss": 1.202, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 1.8803945779800415, |
|
"learning_rate": 4.956109715524608e-05, |
|
"loss": 1.2488, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.1796, |
|
"grad_norm": 1.6330218315124512, |
|
"learning_rate": 4.9559141366300594e-05, |
|
"loss": 1.168, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.9217214584350586, |
|
"learning_rate": 4.9557181268217227e-05, |
|
"loss": 1.2207, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.1804, |
|
"grad_norm": 1.7866103649139404, |
|
"learning_rate": 4.9555216861339876e-05, |
|
"loss": 1.2106, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.1808, |
|
"grad_norm": 1.7389551401138306, |
|
"learning_rate": 4.955324814601324e-05, |
|
"loss": 1.176, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.1812, |
|
"grad_norm": 1.8251265287399292, |
|
"learning_rate": 4.955127512258273e-05, |
|
"loss": 1.1974, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.1816, |
|
"grad_norm": 1.402008056640625, |
|
"learning_rate": 4.954929779139455e-05, |
|
"loss": 1.2318, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.182, |
|
"grad_norm": 1.4329712390899658, |
|
"learning_rate": 4.954731615279563e-05, |
|
"loss": 1.1501, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 1.6272238492965698, |
|
"learning_rate": 4.9545330207133664e-05, |
|
"loss": 1.1612, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.1828, |
|
"grad_norm": 1.5852447748184204, |
|
"learning_rate": 4.954333995475712e-05, |
|
"loss": 1.2153, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.1832, |
|
"grad_norm": 1.7610340118408203, |
|
"learning_rate": 4.9541345396015193e-05, |
|
"loss": 1.2303, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.1836, |
|
"grad_norm": 1.5253504514694214, |
|
"learning_rate": 4.953934653125786e-05, |
|
"loss": 1.1946, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 1.3086066246032715, |
|
"learning_rate": 4.953734336083583e-05, |
|
"loss": 1.1768, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.1844, |
|
"grad_norm": 1.6699544191360474, |
|
"learning_rate": 4.9535335885100575e-05, |
|
"loss": 1.1911, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.1848, |
|
"grad_norm": 1.6090809106826782, |
|
"learning_rate": 4.953332410440435e-05, |
|
"loss": 1.1878, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.1852, |
|
"grad_norm": 1.6638312339782715, |
|
"learning_rate": 4.953130801910011e-05, |
|
"loss": 1.2607, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 1.8563107252120972, |
|
"learning_rate": 4.952928762954161e-05, |
|
"loss": 1.1796, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.186, |
|
"grad_norm": 1.6669505834579468, |
|
"learning_rate": 4.952726293608335e-05, |
|
"loss": 1.2501, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.1864, |
|
"grad_norm": 1.6013710498809814, |
|
"learning_rate": 4.952523393908059e-05, |
|
"loss": 1.2109, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.1868, |
|
"grad_norm": 1.4567033052444458, |
|
"learning_rate": 4.952320063888932e-05, |
|
"loss": 1.182, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.1872, |
|
"grad_norm": 1.8492445945739746, |
|
"learning_rate": 4.952116303586631e-05, |
|
"loss": 1.144, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.1876, |
|
"grad_norm": 1.6545616388320923, |
|
"learning_rate": 4.951912113036908e-05, |
|
"loss": 1.2085, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.188, |
|
"grad_norm": 2.060640573501587, |
|
"learning_rate": 4.951707492275589e-05, |
|
"loss": 1.2282, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.1884, |
|
"grad_norm": 1.481137752532959, |
|
"learning_rate": 4.951502441338578e-05, |
|
"loss": 1.2097, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 1.6217643022537231, |
|
"learning_rate": 4.951296960261853e-05, |
|
"loss": 1.1946, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.1892, |
|
"grad_norm": 1.6603466272354126, |
|
"learning_rate": 4.9510910490814666e-05, |
|
"loss": 1.1917, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.1896, |
|
"grad_norm": 1.6394786834716797, |
|
"learning_rate": 4.9508847078335495e-05, |
|
"loss": 1.2794, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5222853422164917, |
|
"learning_rate": 4.9506779365543046e-05, |
|
"loss": 1.2223, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.1904, |
|
"grad_norm": 1.431310772895813, |
|
"learning_rate": 4.9504707352800125e-05, |
|
"loss": 1.2508, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.1908, |
|
"grad_norm": 1.5400367975234985, |
|
"learning_rate": 4.950263104047031e-05, |
|
"loss": 1.2497, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.1912, |
|
"grad_norm": 1.5690429210662842, |
|
"learning_rate": 4.950055042891786e-05, |
|
"loss": 1.2056, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.1916, |
|
"grad_norm": 1.645679235458374, |
|
"learning_rate": 4.949846551850788e-05, |
|
"loss": 1.1822, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.8032221794128418, |
|
"learning_rate": 4.949637630960617e-05, |
|
"loss": 1.2102, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1924, |
|
"grad_norm": 1.5847824811935425, |
|
"learning_rate": 4.949428280257932e-05, |
|
"loss": 1.2044, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.1928, |
|
"grad_norm": 1.5460349321365356, |
|
"learning_rate": 4.9492184997794624e-05, |
|
"loss": 1.1794, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.1932, |
|
"grad_norm": 1.4651161432266235, |
|
"learning_rate": 4.949008289562019e-05, |
|
"loss": 1.1904, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.1936, |
|
"grad_norm": 1.5283023118972778, |
|
"learning_rate": 4.948797649642484e-05, |
|
"loss": 1.2205, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.194, |
|
"grad_norm": 1.733211874961853, |
|
"learning_rate": 4.948586580057816e-05, |
|
"loss": 1.2324, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.1944, |
|
"grad_norm": 1.4706699848175049, |
|
"learning_rate": 4.94837508084505e-05, |
|
"loss": 1.17, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.1948, |
|
"grad_norm": 1.7306286096572876, |
|
"learning_rate": 4.948163152041295e-05, |
|
"loss": 1.2305, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 1.6754021644592285, |
|
"learning_rate": 4.9479507936837364e-05, |
|
"loss": 1.2448, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.1956, |
|
"grad_norm": 1.6080971956253052, |
|
"learning_rate": 4.9477380058096343e-05, |
|
"loss": 1.2214, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.196, |
|
"grad_norm": 1.6312017440795898, |
|
"learning_rate": 4.947524788456325e-05, |
|
"loss": 1.2026, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.1964, |
|
"grad_norm": 1.5370644330978394, |
|
"learning_rate": 4.947311141661218e-05, |
|
"loss": 1.2314, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.1968, |
|
"grad_norm": 1.5101724863052368, |
|
"learning_rate": 4.947097065461801e-05, |
|
"loss": 1.1738, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.1972, |
|
"grad_norm": 1.5563039779663086, |
|
"learning_rate": 4.946882559895635e-05, |
|
"loss": 1.196, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.1976, |
|
"grad_norm": 1.8207404613494873, |
|
"learning_rate": 4.9466676250003576e-05, |
|
"loss": 1.2136, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.198, |
|
"grad_norm": 1.5161926746368408, |
|
"learning_rate": 4.9464522608136805e-05, |
|
"loss": 1.2426, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 1.722732663154602, |
|
"learning_rate": 4.946236467373392e-05, |
|
"loss": 1.1381, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.1988, |
|
"grad_norm": 1.4065109491348267, |
|
"learning_rate": 4.946020244717355e-05, |
|
"loss": 1.2316, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.1992, |
|
"grad_norm": 1.5711966753005981, |
|
"learning_rate": 4.945803592883509e-05, |
|
"loss": 1.1682, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.1996, |
|
"grad_norm": 1.6523547172546387, |
|
"learning_rate": 4.945586511909865e-05, |
|
"loss": 1.1923, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.4545037746429443, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 1.2123, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.2004, |
|
"grad_norm": 1.5507062673568726, |
|
"learning_rate": 4.9451510626956196e-05, |
|
"loss": 1.1574, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.2008, |
|
"grad_norm": 1.5979762077331543, |
|
"learning_rate": 4.944932694531422e-05, |
|
"loss": 1.2123, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.2012, |
|
"grad_norm": 1.7365368604660034, |
|
"learning_rate": 4.944713897380235e-05, |
|
"loss": 1.1493, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 1.5356484651565552, |
|
"learning_rate": 4.9444946712804494e-05, |
|
"loss": 1.1729, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.202, |
|
"grad_norm": 1.5858573913574219, |
|
"learning_rate": 4.9442750162705295e-05, |
|
"loss": 1.2178, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.2024, |
|
"grad_norm": 1.4931632280349731, |
|
"learning_rate": 4.9440549323890176e-05, |
|
"loss": 1.2411, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.2028, |
|
"grad_norm": 1.933543086051941, |
|
"learning_rate": 4.943834419674529e-05, |
|
"loss": 1.2217, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.2032, |
|
"grad_norm": 2.18145489692688, |
|
"learning_rate": 4.943613478165753e-05, |
|
"loss": 1.1514, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.2036, |
|
"grad_norm": 1.430503249168396, |
|
"learning_rate": 4.943392107901458e-05, |
|
"loss": 1.2466, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.204, |
|
"grad_norm": 1.3851200342178345, |
|
"learning_rate": 4.943170308920484e-05, |
|
"loss": 1.2234, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.2044, |
|
"grad_norm": 1.4635603427886963, |
|
"learning_rate": 4.942948081261749e-05, |
|
"loss": 1.1526, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 1.6983256340026855, |
|
"learning_rate": 4.9427254249642444e-05, |
|
"loss": 1.2161, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.2052, |
|
"grad_norm": 1.7087597846984863, |
|
"learning_rate": 4.942502340067038e-05, |
|
"loss": 1.2492, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.2056, |
|
"grad_norm": 1.786859393119812, |
|
"learning_rate": 4.9422788266092715e-05, |
|
"loss": 1.1799, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.206, |
|
"grad_norm": 1.4722228050231934, |
|
"learning_rate": 4.942054884630162e-05, |
|
"loss": 1.2395, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.2064, |
|
"grad_norm": 1.6726198196411133, |
|
"learning_rate": 4.941830514169004e-05, |
|
"loss": 1.148, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.2068, |
|
"grad_norm": 1.4107918739318848, |
|
"learning_rate": 4.941605715265164e-05, |
|
"loss": 1.1613, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.2072, |
|
"grad_norm": 1.531030297279358, |
|
"learning_rate": 4.941380487958086e-05, |
|
"loss": 1.3184, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.2076, |
|
"grad_norm": 1.9271130561828613, |
|
"learning_rate": 4.941154832287288e-05, |
|
"loss": 1.2318, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.493416428565979, |
|
"learning_rate": 4.940928748292363e-05, |
|
"loss": 1.1343, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2084, |
|
"grad_norm": 1.7135025262832642, |
|
"learning_rate": 4.9407022360129796e-05, |
|
"loss": 1.1987, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.2088, |
|
"grad_norm": 1.576484203338623, |
|
"learning_rate": 4.9404752954888824e-05, |
|
"loss": 1.2837, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.2092, |
|
"grad_norm": 1.5237318277359009, |
|
"learning_rate": 4.9402479267598887e-05, |
|
"loss": 1.1382, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.2096, |
|
"grad_norm": 1.70023512840271, |
|
"learning_rate": 4.940020129865895e-05, |
|
"loss": 1.2325, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4940378665924072, |
|
"learning_rate": 4.939791904846869e-05, |
|
"loss": 1.1978, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.2104, |
|
"grad_norm": 1.4972904920578003, |
|
"learning_rate": 4.939563251742855e-05, |
|
"loss": 1.1918, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.2108, |
|
"grad_norm": 1.4454622268676758, |
|
"learning_rate": 4.939334170593972e-05, |
|
"loss": 1.1975, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 1.844054102897644, |
|
"learning_rate": 4.939104661440415e-05, |
|
"loss": 1.1543, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2116, |
|
"grad_norm": 1.446661114692688, |
|
"learning_rate": 4.938874724322454e-05, |
|
"loss": 1.2034, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.212, |
|
"grad_norm": 1.7901402711868286, |
|
"learning_rate": 4.938644359280433e-05, |
|
"loss": 1.1945, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.2124, |
|
"grad_norm": 1.6717280149459839, |
|
"learning_rate": 4.938413566354772e-05, |
|
"loss": 1.1632, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.2128, |
|
"grad_norm": 1.4415397644042969, |
|
"learning_rate": 4.938182345585966e-05, |
|
"loss": 1.1729, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.2132, |
|
"grad_norm": 1.5009517669677734, |
|
"learning_rate": 4.937950697014585e-05, |
|
"loss": 1.298, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.2136, |
|
"grad_norm": 1.6571409702301025, |
|
"learning_rate": 4.937718620681273e-05, |
|
"loss": 1.2179, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.214, |
|
"grad_norm": 1.4815192222595215, |
|
"learning_rate": 4.937486116626752e-05, |
|
"loss": 1.175, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 1.3653024435043335, |
|
"learning_rate": 4.9372531848918145e-05, |
|
"loss": 1.123, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.2148, |
|
"grad_norm": 1.743571162223816, |
|
"learning_rate": 4.937019825517333e-05, |
|
"loss": 1.224, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.2152, |
|
"grad_norm": 1.633367657661438, |
|
"learning_rate": 4.936786038544251e-05, |
|
"loss": 1.1604, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.2156, |
|
"grad_norm": 1.5236365795135498, |
|
"learning_rate": 4.936551824013589e-05, |
|
"loss": 1.255, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 1.6469926834106445, |
|
"learning_rate": 4.9363171819664434e-05, |
|
"loss": 1.2126, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.2164, |
|
"grad_norm": 1.4393495321273804, |
|
"learning_rate": 4.936082112443983e-05, |
|
"loss": 1.2172, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.2168, |
|
"grad_norm": 1.4337791204452515, |
|
"learning_rate": 4.935846615487453e-05, |
|
"loss": 1.2352, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.2172, |
|
"grad_norm": 1.488175630569458, |
|
"learning_rate": 4.935610691138175e-05, |
|
"loss": 1.1805, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 1.7446844577789307, |
|
"learning_rate": 4.935374339437543e-05, |
|
"loss": 1.1901, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.218, |
|
"grad_norm": 1.6607255935668945, |
|
"learning_rate": 4.935137560427027e-05, |
|
"loss": 1.2369, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.2184, |
|
"grad_norm": 1.4664397239685059, |
|
"learning_rate": 4.934900354148173e-05, |
|
"loss": 1.1617, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.2188, |
|
"grad_norm": 1.5030437707901, |
|
"learning_rate": 4.934662720642601e-05, |
|
"loss": 1.2486, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.2192, |
|
"grad_norm": 1.3806397914886475, |
|
"learning_rate": 4.934424659952006e-05, |
|
"loss": 1.1869, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.2196, |
|
"grad_norm": 1.8397371768951416, |
|
"learning_rate": 4.934186172118157e-05, |
|
"loss": 1.2266, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.6181316375732422, |
|
"learning_rate": 4.933947257182901e-05, |
|
"loss": 1.223, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2204, |
|
"grad_norm": 1.459485650062561, |
|
"learning_rate": 4.933707915188156e-05, |
|
"loss": 1.2411, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 1.567919373512268, |
|
"learning_rate": 4.933468146175918e-05, |
|
"loss": 1.2363, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.2212, |
|
"grad_norm": 1.3925971984863281, |
|
"learning_rate": 4.9332279501882564e-05, |
|
"loss": 1.2627, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.2216, |
|
"grad_norm": 1.4231473207473755, |
|
"learning_rate": 4.932987327267316e-05, |
|
"loss": 1.1886, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.222, |
|
"grad_norm": 1.6479936838150024, |
|
"learning_rate": 4.9327462774553166e-05, |
|
"loss": 1.1865, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.2224, |
|
"grad_norm": 1.5866742134094238, |
|
"learning_rate": 4.9325048007945526e-05, |
|
"loss": 1.1956, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.2228, |
|
"grad_norm": 1.6373844146728516, |
|
"learning_rate": 4.932262897327393e-05, |
|
"loss": 1.1883, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.2232, |
|
"grad_norm": 1.3438984155654907, |
|
"learning_rate": 4.9320205670962814e-05, |
|
"loss": 1.2066, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.2236, |
|
"grad_norm": 1.5680036544799805, |
|
"learning_rate": 4.93177781014374e-05, |
|
"loss": 1.2162, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.7673883438110352, |
|
"learning_rate": 4.9315346265123594e-05, |
|
"loss": 1.1993, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2244, |
|
"grad_norm": 1.8897387981414795, |
|
"learning_rate": 4.93129101624481e-05, |
|
"loss": 1.2517, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.2248, |
|
"grad_norm": 1.4127836227416992, |
|
"learning_rate": 4.931046979383835e-05, |
|
"loss": 1.2182, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.2252, |
|
"grad_norm": 1.5710681676864624, |
|
"learning_rate": 4.930802515972255e-05, |
|
"loss": 1.1595, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.2256, |
|
"grad_norm": 1.7648258209228516, |
|
"learning_rate": 4.9305576260529607e-05, |
|
"loss": 1.2714, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.226, |
|
"grad_norm": 1.628089427947998, |
|
"learning_rate": 4.930312309668922e-05, |
|
"loss": 1.1958, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.2264, |
|
"grad_norm": 1.6491187810897827, |
|
"learning_rate": 4.930066566863182e-05, |
|
"loss": 1.2086, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.2268, |
|
"grad_norm": 1.5382400751113892, |
|
"learning_rate": 4.929820397678858e-05, |
|
"loss": 1.2069, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.929623155367841e-05, |
|
"loss": 1.1623, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.2276, |
|
"grad_norm": 1.6124461889266968, |
|
"learning_rate": 4.929376218810963e-05, |
|
"loss": 1.1087, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.228, |
|
"grad_norm": 1.5880391597747803, |
|
"learning_rate": 4.9291288559966286e-05, |
|
"loss": 1.1652, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.2284, |
|
"grad_norm": 1.4969943761825562, |
|
"learning_rate": 4.928881066968241e-05, |
|
"loss": 1.1668, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.2288, |
|
"grad_norm": 1.5591754913330078, |
|
"learning_rate": 4.928632851769277e-05, |
|
"loss": 1.2072, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.2292, |
|
"grad_norm": 1.5494965314865112, |
|
"learning_rate": 4.9283842104432886e-05, |
|
"loss": 1.225, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.2296, |
|
"grad_norm": 1.4970649480819702, |
|
"learning_rate": 4.928135143033903e-05, |
|
"loss": 1.2435, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5952671766281128, |
|
"learning_rate": 4.9278856495848195e-05, |
|
"loss": 1.1857, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 1.7178370952606201, |
|
"learning_rate": 4.927635730139816e-05, |
|
"loss": 1.2055, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.2308, |
|
"grad_norm": 1.5079060792922974, |
|
"learning_rate": 4.927385384742742e-05, |
|
"loss": 1.2028, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.2312, |
|
"grad_norm": 1.504225730895996, |
|
"learning_rate": 4.927134613437524e-05, |
|
"loss": 1.1848, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.2316, |
|
"grad_norm": 1.7504538297653198, |
|
"learning_rate": 4.9268834162681607e-05, |
|
"loss": 1.1649, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 1.7637404203414917, |
|
"learning_rate": 4.9266317932787294e-05, |
|
"loss": 1.1629, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2324, |
|
"grad_norm": 1.6753672361373901, |
|
"learning_rate": 4.926379744513378e-05, |
|
"loss": 1.252, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.2328, |
|
"grad_norm": 1.7061619758605957, |
|
"learning_rate": 4.9261272700163315e-05, |
|
"loss": 1.2134, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.2332, |
|
"grad_norm": 1.7861872911453247, |
|
"learning_rate": 4.9258743698318886e-05, |
|
"loss": 1.2394, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 1.9671781063079834, |
|
"learning_rate": 4.925621044004424e-05, |
|
"loss": 1.2327, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.234, |
|
"grad_norm": 1.5310274362564087, |
|
"learning_rate": 4.9253672925783846e-05, |
|
"loss": 1.2464, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.2344, |
|
"grad_norm": 1.5653115510940552, |
|
"learning_rate": 4.925113115598295e-05, |
|
"loss": 1.252, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.2348, |
|
"grad_norm": 1.6528226137161255, |
|
"learning_rate": 4.9248585131087524e-05, |
|
"loss": 1.1158, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.2352, |
|
"grad_norm": 1.6093913316726685, |
|
"learning_rate": 4.9246034851544294e-05, |
|
"loss": 1.1969, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.2356, |
|
"grad_norm": 1.875658631324768, |
|
"learning_rate": 4.9243480317800727e-05, |
|
"loss": 1.167, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.236, |
|
"grad_norm": 1.6207784414291382, |
|
"learning_rate": 4.9240921530305043e-05, |
|
"loss": 1.1764, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.2364, |
|
"grad_norm": 1.3430836200714111, |
|
"learning_rate": 4.9238358489506206e-05, |
|
"loss": 1.1961, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 1.6542601585388184, |
|
"learning_rate": 4.9235791195853935e-05, |
|
"loss": 1.178, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.2372, |
|
"grad_norm": 1.4790018796920776, |
|
"learning_rate": 4.9233219649798676e-05, |
|
"loss": 1.1947, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.2376, |
|
"grad_norm": 1.6997478008270264, |
|
"learning_rate": 4.923064385179163e-05, |
|
"loss": 1.1895, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.238, |
|
"grad_norm": 1.570786476135254, |
|
"learning_rate": 4.9228063802284755e-05, |
|
"loss": 1.1722, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.2384, |
|
"grad_norm": 1.4543014764785767, |
|
"learning_rate": 4.922547950173073e-05, |
|
"loss": 1.153, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.2388, |
|
"grad_norm": 1.5781867504119873, |
|
"learning_rate": 4.922289095058301e-05, |
|
"loss": 1.2221, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.2392, |
|
"grad_norm": 1.5818521976470947, |
|
"learning_rate": 4.922029814929577e-05, |
|
"loss": 1.1816, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.2396, |
|
"grad_norm": 1.3536361455917358, |
|
"learning_rate": 4.921770109832396e-05, |
|
"loss": 1.2118, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.9095213413238525, |
|
"learning_rate": 4.9215099798123245e-05, |
|
"loss": 1.1636, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2404, |
|
"grad_norm": 1.721266746520996, |
|
"learning_rate": 4.9212494249150046e-05, |
|
"loss": 1.1963, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.2408, |
|
"grad_norm": 1.4719969034194946, |
|
"learning_rate": 4.920988445186153e-05, |
|
"loss": 1.1835, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.2412, |
|
"grad_norm": 1.5968965291976929, |
|
"learning_rate": 4.920727040671562e-05, |
|
"loss": 1.2169, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.2416, |
|
"grad_norm": 1.3878967761993408, |
|
"learning_rate": 4.920465211417097e-05, |
|
"loss": 1.17, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.242, |
|
"grad_norm": 1.4749703407287598, |
|
"learning_rate": 4.9202029574686986e-05, |
|
"loss": 1.2224, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.2424, |
|
"grad_norm": 1.751265525817871, |
|
"learning_rate": 4.9199402788723817e-05, |
|
"loss": 1.2115, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.2428, |
|
"grad_norm": 1.902597427368164, |
|
"learning_rate": 4.919677175674236e-05, |
|
"loss": 1.2174, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 1.5894381999969482, |
|
"learning_rate": 4.919413647920426e-05, |
|
"loss": 1.1873, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.2436, |
|
"grad_norm": 1.6837565898895264, |
|
"learning_rate": 4.9191496956571886e-05, |
|
"loss": 1.2807, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.244, |
|
"grad_norm": 1.464011788368225, |
|
"learning_rate": 4.9188853189308385e-05, |
|
"loss": 1.1782, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.2444, |
|
"grad_norm": 1.8131499290466309, |
|
"learning_rate": 4.918620517787762e-05, |
|
"loss": 1.1536, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.2448, |
|
"grad_norm": 1.5556432008743286, |
|
"learning_rate": 4.9183552922744204e-05, |
|
"loss": 1.2281, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.2452, |
|
"grad_norm": 1.51846444606781, |
|
"learning_rate": 4.9180896424373526e-05, |
|
"loss": 1.2087, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.2456, |
|
"grad_norm": 1.334592580795288, |
|
"learning_rate": 4.917823568323167e-05, |
|
"loss": 1.1934, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.246, |
|
"grad_norm": 1.6459736824035645, |
|
"learning_rate": 4.9175570699785494e-05, |
|
"loss": 1.2278, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 1.4319044351577759, |
|
"learning_rate": 4.9172901474502606e-05, |
|
"loss": 1.1466, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.2468, |
|
"grad_norm": 1.677406907081604, |
|
"learning_rate": 4.917022800785134e-05, |
|
"loss": 1.1745, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.2472, |
|
"grad_norm": 1.3851629495620728, |
|
"learning_rate": 4.916755030030077e-05, |
|
"loss": 1.1146, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.2476, |
|
"grad_norm": 1.5272161960601807, |
|
"learning_rate": 4.9164868352320746e-05, |
|
"loss": 1.2117, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 1.5623695850372314, |
|
"learning_rate": 4.916218216438183e-05, |
|
"loss": 1.1319, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2484, |
|
"grad_norm": 1.4115298986434937, |
|
"learning_rate": 4.915949173695534e-05, |
|
"loss": 1.1782, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.2488, |
|
"grad_norm": 1.6355260610580444, |
|
"learning_rate": 4.915679707051334e-05, |
|
"loss": 1.2008, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.2492, |
|
"grad_norm": 1.600929856300354, |
|
"learning_rate": 4.915409816552864e-05, |
|
"loss": 1.2382, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 1.574831247329712, |
|
"learning_rate": 4.915139502247478e-05, |
|
"loss": 1.188, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.426468014717102, |
|
"learning_rate": 4.914868764182605e-05, |
|
"loss": 1.0933, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.2504, |
|
"grad_norm": 1.7503633499145508, |
|
"learning_rate": 4.914597602405749e-05, |
|
"loss": 1.1729, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.2508, |
|
"grad_norm": 1.7526756525039673, |
|
"learning_rate": 4.914326016964489e-05, |
|
"loss": 1.1639, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.2512, |
|
"grad_norm": 1.611013412475586, |
|
"learning_rate": 4.9140540079064757e-05, |
|
"loss": 1.193, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.2516, |
|
"grad_norm": 1.6564090251922607, |
|
"learning_rate": 4.913781575279437e-05, |
|
"loss": 1.2075, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.252, |
|
"grad_norm": 1.5011568069458008, |
|
"learning_rate": 4.913508719131173e-05, |
|
"loss": 1.2225, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2524, |
|
"grad_norm": 1.7857328653335571, |
|
"learning_rate": 4.913235439509559e-05, |
|
"loss": 1.1788, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 1.6372023820877075, |
|
"learning_rate": 4.912961736462545e-05, |
|
"loss": 1.2139, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.2532, |
|
"grad_norm": 1.658705234527588, |
|
"learning_rate": 4.912687610038155e-05, |
|
"loss": 1.1624, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.2536, |
|
"grad_norm": 1.5441612005233765, |
|
"learning_rate": 4.9124130602844856e-05, |
|
"loss": 1.2376, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.254, |
|
"grad_norm": 1.6848093271255493, |
|
"learning_rate": 4.91213808724971e-05, |
|
"loss": 1.1688, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.2544, |
|
"grad_norm": 1.510003924369812, |
|
"learning_rate": 4.911862690982076e-05, |
|
"loss": 1.162, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.2548, |
|
"grad_norm": 1.604901909828186, |
|
"learning_rate": 4.9115868715299044e-05, |
|
"loss": 1.1569, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.2552, |
|
"grad_norm": 1.606873869895935, |
|
"learning_rate": 4.911310628941588e-05, |
|
"loss": 1.2192, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.2556, |
|
"grad_norm": 1.5792936086654663, |
|
"learning_rate": 4.9110339632655986e-05, |
|
"loss": 1.2065, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.4216783046722412, |
|
"learning_rate": 4.9107568745504796e-05, |
|
"loss": 1.1822, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2564, |
|
"grad_norm": 1.6753329038619995, |
|
"learning_rate": 4.910479362844848e-05, |
|
"loss": 1.2008, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.2568, |
|
"grad_norm": 1.7560926675796509, |
|
"learning_rate": 4.9102014281973965e-05, |
|
"loss": 1.2367, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.2572, |
|
"grad_norm": 1.5583642721176147, |
|
"learning_rate": 4.90992307065689e-05, |
|
"loss": 1.0914, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.2576, |
|
"grad_norm": 1.4227246046066284, |
|
"learning_rate": 4.909644290272172e-05, |
|
"loss": 1.1597, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.258, |
|
"grad_norm": 1.5329896211624146, |
|
"learning_rate": 4.909365087092154e-05, |
|
"loss": 1.1979, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.2584, |
|
"grad_norm": 1.5972647666931152, |
|
"learning_rate": 4.909085461165827e-05, |
|
"loss": 1.1652, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.2588, |
|
"grad_norm": 1.2990952730178833, |
|
"learning_rate": 4.9088054125422536e-05, |
|
"loss": 1.1712, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 1.9855566024780273, |
|
"learning_rate": 4.90852494127057e-05, |
|
"loss": 1.1496, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.2596, |
|
"grad_norm": 1.7270143032073975, |
|
"learning_rate": 4.9082440473999894e-05, |
|
"loss": 1.1812, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.8106093406677246, |
|
"learning_rate": 4.9079627309797954e-05, |
|
"loss": 1.1809, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.2604, |
|
"grad_norm": 1.5340815782546997, |
|
"learning_rate": 4.907680992059349e-05, |
|
"loss": 1.1683, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.2608, |
|
"grad_norm": 2.0770516395568848, |
|
"learning_rate": 4.9073988306880835e-05, |
|
"loss": 1.2156, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.2612, |
|
"grad_norm": 1.7493189573287964, |
|
"learning_rate": 4.907116246915507e-05, |
|
"loss": 1.1812, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.2616, |
|
"grad_norm": 1.8361352682113647, |
|
"learning_rate": 4.906833240791202e-05, |
|
"loss": 1.1727, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.262, |
|
"grad_norm": 1.3989719152450562, |
|
"learning_rate": 4.906549812364824e-05, |
|
"loss": 1.2014, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 1.319180965423584, |
|
"learning_rate": 4.906265961686103e-05, |
|
"loss": 1.1225, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.2628, |
|
"grad_norm": 1.345543622970581, |
|
"learning_rate": 4.9059816888048435e-05, |
|
"loss": 1.1486, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.2632, |
|
"grad_norm": 1.6714309453964233, |
|
"learning_rate": 4.9056969937709244e-05, |
|
"loss": 1.2246, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.2636, |
|
"grad_norm": 1.8523310422897339, |
|
"learning_rate": 4.905411876634298e-05, |
|
"loss": 1.2018, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 2.405902147293091, |
|
"learning_rate": 4.905126337444991e-05, |
|
"loss": 1.1864, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2644, |
|
"grad_norm": 1.8036085367202759, |
|
"learning_rate": 4.9048403762531034e-05, |
|
"loss": 1.1622, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.2648, |
|
"grad_norm": 1.7046128511428833, |
|
"learning_rate": 4.90455399310881e-05, |
|
"loss": 1.236, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.2652, |
|
"grad_norm": 1.393291711807251, |
|
"learning_rate": 4.90426718806236e-05, |
|
"loss": 1.1616, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 1.621147632598877, |
|
"learning_rate": 4.903979961164076e-05, |
|
"loss": 1.1271, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.266, |
|
"grad_norm": 1.672818899154663, |
|
"learning_rate": 4.903692312464354e-05, |
|
"loss": 1.2407, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.2664, |
|
"grad_norm": 1.5829147100448608, |
|
"learning_rate": 4.9034042420136656e-05, |
|
"loss": 1.2192, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.2668, |
|
"grad_norm": 1.6565492153167725, |
|
"learning_rate": 4.903115749862555e-05, |
|
"loss": 1.1928, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.2672, |
|
"grad_norm": 1.6932697296142578, |
|
"learning_rate": 4.9028268360616415e-05, |
|
"loss": 1.155, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.2676, |
|
"grad_norm": 1.8302545547485352, |
|
"learning_rate": 4.902537500661617e-05, |
|
"loss": 1.2264, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.268, |
|
"grad_norm": 1.6364758014678955, |
|
"learning_rate": 4.9022477437132485e-05, |
|
"loss": 1.2153, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.2684, |
|
"grad_norm": 1.8215218782424927, |
|
"learning_rate": 4.9019575652673776e-05, |
|
"loss": 1.1692, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 1.5692654848098755, |
|
"learning_rate": 4.901666965374917e-05, |
|
"loss": 1.2065, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.2692, |
|
"grad_norm": 1.7172425985336304, |
|
"learning_rate": 4.901375944086857e-05, |
|
"loss": 1.1476, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.2696, |
|
"grad_norm": 1.4633954763412476, |
|
"learning_rate": 4.901084501454259e-05, |
|
"loss": 1.1985, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.598120927810669, |
|
"learning_rate": 4.900792637528261e-05, |
|
"loss": 1.1644, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.2704, |
|
"grad_norm": 1.7177152633666992, |
|
"learning_rate": 4.900500352360071e-05, |
|
"loss": 1.1758, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.2708, |
|
"grad_norm": 1.6786895990371704, |
|
"learning_rate": 4.900207646000975e-05, |
|
"loss": 1.2129, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.2712, |
|
"grad_norm": 1.7196831703186035, |
|
"learning_rate": 4.89991451850233e-05, |
|
"loss": 1.2125, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.2716, |
|
"grad_norm": 2.0396034717559814, |
|
"learning_rate": 4.899620969915568e-05, |
|
"loss": 1.2289, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.5949326753616333, |
|
"learning_rate": 4.899327000292197e-05, |
|
"loss": 1.1816, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2724, |
|
"grad_norm": 1.5548696517944336, |
|
"learning_rate": 4.8990326096837945e-05, |
|
"loss": 1.1429, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.2728, |
|
"grad_norm": 1.6374534368515015, |
|
"learning_rate": 4.8987377981420154e-05, |
|
"loss": 1.1942, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.2732, |
|
"grad_norm": 1.7425651550292969, |
|
"learning_rate": 4.898442565718586e-05, |
|
"loss": 1.1463, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.2736, |
|
"grad_norm": 1.5861971378326416, |
|
"learning_rate": 4.898146912465309e-05, |
|
"loss": 1.1314, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.274, |
|
"grad_norm": 1.7639867067337036, |
|
"learning_rate": 4.897850838434059e-05, |
|
"loss": 1.1952, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.2744, |
|
"grad_norm": 1.6477775573730469, |
|
"learning_rate": 4.897554343676785e-05, |
|
"loss": 1.1891, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.2748, |
|
"grad_norm": 1.676500678062439, |
|
"learning_rate": 4.897257428245511e-05, |
|
"loss": 1.1931, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 1.694056749343872, |
|
"learning_rate": 4.896960092192332e-05, |
|
"loss": 1.1777, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.2756, |
|
"grad_norm": 1.711790680885315, |
|
"learning_rate": 4.896662335569419e-05, |
|
"loss": 1.1927, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.276, |
|
"grad_norm": 1.548268437385559, |
|
"learning_rate": 4.896364158429016e-05, |
|
"loss": 1.1411, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.2764, |
|
"grad_norm": 1.5789943933486938, |
|
"learning_rate": 4.896065560823442e-05, |
|
"loss": 1.2414, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.2768, |
|
"grad_norm": 1.7220913171768188, |
|
"learning_rate": 4.895766542805088e-05, |
|
"loss": 1.1892, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.2772, |
|
"grad_norm": 1.6338775157928467, |
|
"learning_rate": 4.895467104426421e-05, |
|
"loss": 1.1721, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.2776, |
|
"grad_norm": 1.5051183700561523, |
|
"learning_rate": 4.895167245739978e-05, |
|
"loss": 1.1903, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.278, |
|
"grad_norm": 1.6972531080245972, |
|
"learning_rate": 4.894866966798374e-05, |
|
"loss": 1.2405, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 1.515919804573059, |
|
"learning_rate": 4.8945662676542946e-05, |
|
"loss": 1.1592, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.2788, |
|
"grad_norm": 1.4641064405441284, |
|
"learning_rate": 4.894265148360502e-05, |
|
"loss": 1.1888, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.2792, |
|
"grad_norm": 2.053290843963623, |
|
"learning_rate": 4.893963608969828e-05, |
|
"loss": 1.1829, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.2796, |
|
"grad_norm": 1.4748057126998901, |
|
"learning_rate": 4.8936616495351826e-05, |
|
"loss": 1.1498, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.6499887704849243, |
|
"learning_rate": 4.8933592701095466e-05, |
|
"loss": 1.2141, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2804, |
|
"grad_norm": 1.6166945695877075, |
|
"learning_rate": 4.8930564707459756e-05, |
|
"loss": 1.1658, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.2808, |
|
"grad_norm": 1.8569326400756836, |
|
"learning_rate": 4.892753251497599e-05, |
|
"loss": 1.2115, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.2812, |
|
"grad_norm": 1.6417903900146484, |
|
"learning_rate": 4.89244961241762e-05, |
|
"loss": 1.1523, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 1.4864392280578613, |
|
"learning_rate": 4.892145553559313e-05, |
|
"loss": 1.1845, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.282, |
|
"grad_norm": 1.427742600440979, |
|
"learning_rate": 4.89184107497603e-05, |
|
"loss": 1.2047, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.2824, |
|
"grad_norm": 1.2292075157165527, |
|
"learning_rate": 4.891536176721194e-05, |
|
"loss": 1.1957, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.2828, |
|
"grad_norm": 1.3831216096878052, |
|
"learning_rate": 4.891230858848302e-05, |
|
"loss": 1.1987, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.2832, |
|
"grad_norm": 1.2832379341125488, |
|
"learning_rate": 4.8909251214109264e-05, |
|
"loss": 1.209, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.2836, |
|
"grad_norm": 1.6963448524475098, |
|
"learning_rate": 4.89061896446271e-05, |
|
"loss": 1.1535, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.284, |
|
"grad_norm": 1.6399435997009277, |
|
"learning_rate": 4.890312388057371e-05, |
|
"loss": 1.1641, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.2844, |
|
"grad_norm": 1.6266658306121826, |
|
"learning_rate": 4.890005392248702e-05, |
|
"loss": 1.1824, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 1.6541725397109985, |
|
"learning_rate": 4.889697977090569e-05, |
|
"loss": 1.1722, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.2852, |
|
"grad_norm": 1.7126740217208862, |
|
"learning_rate": 4.8893901426369097e-05, |
|
"loss": 1.1279, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.2856, |
|
"grad_norm": 1.5441510677337646, |
|
"learning_rate": 4.889081888941738e-05, |
|
"loss": 1.2244, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.286, |
|
"grad_norm": 1.7929022312164307, |
|
"learning_rate": 4.888773216059138e-05, |
|
"loss": 1.1526, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.2864, |
|
"grad_norm": 1.5796688795089722, |
|
"learning_rate": 4.88846412404327e-05, |
|
"loss": 1.1694, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.2868, |
|
"grad_norm": 1.8234366178512573, |
|
"learning_rate": 4.888154612948369e-05, |
|
"loss": 1.2442, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.2872, |
|
"grad_norm": 1.587524652481079, |
|
"learning_rate": 4.887844682828739e-05, |
|
"loss": 1.1758, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.2876, |
|
"grad_norm": 1.4562510251998901, |
|
"learning_rate": 4.887534333738762e-05, |
|
"loss": 1.1729, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.8747830390930176, |
|
"learning_rate": 4.887223565732892e-05, |
|
"loss": 1.1426, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2884, |
|
"grad_norm": 1.7252371311187744, |
|
"learning_rate": 4.886912378865655e-05, |
|
"loss": 1.2046, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.2888, |
|
"grad_norm": 1.5491045713424683, |
|
"learning_rate": 4.8866007731916516e-05, |
|
"loss": 1.2523, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.2892, |
|
"grad_norm": 1.66487455368042, |
|
"learning_rate": 4.886288748765557e-05, |
|
"loss": 1.1938, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.2896, |
|
"grad_norm": 1.7166435718536377, |
|
"learning_rate": 4.885976305642119e-05, |
|
"loss": 1.1753, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.592100977897644, |
|
"learning_rate": 4.885663443876158e-05, |
|
"loss": 1.148, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.2904, |
|
"grad_norm": 1.4008827209472656, |
|
"learning_rate": 4.885350163522568e-05, |
|
"loss": 1.2096, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.2908, |
|
"grad_norm": 1.8386391401290894, |
|
"learning_rate": 4.8850364646363196e-05, |
|
"loss": 1.1847, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 1.4619133472442627, |
|
"learning_rate": 4.884722347272452e-05, |
|
"loss": 1.1385, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.2916, |
|
"grad_norm": 1.4952281713485718, |
|
"learning_rate": 4.88440781148608e-05, |
|
"loss": 1.1568, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.292, |
|
"grad_norm": 1.7998713254928589, |
|
"learning_rate": 4.884092857332393e-05, |
|
"loss": 1.2424, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.2924, |
|
"grad_norm": 1.4490965604782104, |
|
"learning_rate": 4.883777484866653e-05, |
|
"loss": 1.2045, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.2928, |
|
"grad_norm": 1.3473284244537354, |
|
"learning_rate": 4.8834616941441944e-05, |
|
"loss": 1.1838, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.2932, |
|
"grad_norm": 1.6016281843185425, |
|
"learning_rate": 4.883145485220426e-05, |
|
"loss": 1.1829, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.2936, |
|
"grad_norm": 1.5061883926391602, |
|
"learning_rate": 4.88282885815083e-05, |
|
"loss": 1.1409, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.294, |
|
"grad_norm": 1.6041043996810913, |
|
"learning_rate": 4.8825118129909596e-05, |
|
"loss": 1.171, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 1.563822865486145, |
|
"learning_rate": 4.882194349796446e-05, |
|
"loss": 1.229, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.2948, |
|
"grad_norm": 1.5734202861785889, |
|
"learning_rate": 4.8818764686229904e-05, |
|
"loss": 1.2496, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.2952, |
|
"grad_norm": 1.5884230136871338, |
|
"learning_rate": 4.881558169526368e-05, |
|
"loss": 1.2271, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.2956, |
|
"grad_norm": 1.6446903944015503, |
|
"learning_rate": 4.881239452562427e-05, |
|
"loss": 1.196, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 1.6241570711135864, |
|
"learning_rate": 4.880920317787091e-05, |
|
"loss": 1.2396, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.2964, |
|
"grad_norm": 1.3656667470932007, |
|
"learning_rate": 4.8806007652563526e-05, |
|
"loss": 1.2361, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.2968, |
|
"grad_norm": 1.456030011177063, |
|
"learning_rate": 4.880280795026283e-05, |
|
"loss": 1.1698, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.2972, |
|
"grad_norm": 1.6222285032272339, |
|
"learning_rate": 4.879960407153022e-05, |
|
"loss": 1.2176, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 1.7688349485397339, |
|
"learning_rate": 4.879639601692786e-05, |
|
"loss": 1.1966, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.298, |
|
"grad_norm": 1.7796573638916016, |
|
"learning_rate": 4.879318378701863e-05, |
|
"loss": 1.2249, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.2984, |
|
"grad_norm": 1.565400242805481, |
|
"learning_rate": 4.8789967382366144e-05, |
|
"loss": 1.1471, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.2988, |
|
"grad_norm": 1.5751515626907349, |
|
"learning_rate": 4.8786746803534754e-05, |
|
"loss": 1.1918, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.2992, |
|
"grad_norm": 1.7343270778656006, |
|
"learning_rate": 4.878352205108955e-05, |
|
"loss": 1.209, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.2996, |
|
"grad_norm": 1.8670405149459839, |
|
"learning_rate": 4.878029312559633e-05, |
|
"loss": 1.2088, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.6701018810272217, |
|
"learning_rate": 4.8777060027621644e-05, |
|
"loss": 1.1447, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3004, |
|
"grad_norm": 1.8615483045578003, |
|
"learning_rate": 4.877382275773278e-05, |
|
"loss": 1.2075, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 1.3462893962860107, |
|
"learning_rate": 4.877058131649774e-05, |
|
"loss": 1.13, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.3012, |
|
"grad_norm": 1.4814949035644531, |
|
"learning_rate": 4.876733570448527e-05, |
|
"loss": 1.2166, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.3016, |
|
"grad_norm": 1.3817886114120483, |
|
"learning_rate": 4.8764085922264843e-05, |
|
"loss": 1.1571, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.302, |
|
"grad_norm": 1.8147492408752441, |
|
"learning_rate": 4.8760831970406664e-05, |
|
"loss": 1.1623, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.3024, |
|
"grad_norm": 1.6408344507217407, |
|
"learning_rate": 4.875757384948167e-05, |
|
"loss": 1.2238, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.3028, |
|
"grad_norm": 1.627832293510437, |
|
"learning_rate": 4.875431156006152e-05, |
|
"loss": 1.1451, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.3032, |
|
"grad_norm": 1.5649932622909546, |
|
"learning_rate": 4.8751045102718626e-05, |
|
"loss": 1.1593, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.3036, |
|
"grad_norm": 1.6648448705673218, |
|
"learning_rate": 4.8747774478026124e-05, |
|
"loss": 1.1864, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.5145492553710938, |
|
"learning_rate": 4.8744499686557864e-05, |
|
"loss": 1.1968, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3044, |
|
"grad_norm": 1.4245922565460205, |
|
"learning_rate": 4.874122072888845e-05, |
|
"loss": 1.2392, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.3048, |
|
"grad_norm": 1.7010691165924072, |
|
"learning_rate": 4.87379376055932e-05, |
|
"loss": 1.2011, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.3052, |
|
"grad_norm": 1.9582581520080566, |
|
"learning_rate": 4.873465031724816e-05, |
|
"loss": 1.2047, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.3056, |
|
"grad_norm": 1.9121880531311035, |
|
"learning_rate": 4.873135886443014e-05, |
|
"loss": 1.1488, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.306, |
|
"grad_norm": 1.608534812927246, |
|
"learning_rate": 4.8728063247716646e-05, |
|
"loss": 1.1653, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.3064, |
|
"grad_norm": 1.6228535175323486, |
|
"learning_rate": 4.872476346768593e-05, |
|
"loss": 1.1882, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.3068, |
|
"grad_norm": 1.4595372676849365, |
|
"learning_rate": 4.8721459524916955e-05, |
|
"loss": 1.1683, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 1.4573352336883545, |
|
"learning_rate": 4.8718151419989445e-05, |
|
"loss": 1.1298, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3076, |
|
"grad_norm": 1.5250157117843628, |
|
"learning_rate": 4.8714839153483823e-05, |
|
"loss": 1.2049, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.308, |
|
"grad_norm": 1.4276963472366333, |
|
"learning_rate": 4.8711522725981276e-05, |
|
"loss": 1.1412, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.3084, |
|
"grad_norm": 1.514461636543274, |
|
"learning_rate": 4.8708202138063695e-05, |
|
"loss": 1.1392, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.3088, |
|
"grad_norm": 1.6108927726745605, |
|
"learning_rate": 4.8704877390313716e-05, |
|
"loss": 1.201, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.3092, |
|
"grad_norm": 1.9335185289382935, |
|
"learning_rate": 4.870154848331469e-05, |
|
"loss": 1.159, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.3096, |
|
"grad_norm": 1.754297137260437, |
|
"learning_rate": 4.86982154176507e-05, |
|
"loss": 1.1432, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.6593637466430664, |
|
"learning_rate": 4.869487819390657e-05, |
|
"loss": 1.2357, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 1.3409785032272339, |
|
"learning_rate": 4.8691536812667856e-05, |
|
"loss": 1.1105, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3108, |
|
"grad_norm": 1.4268968105316162, |
|
"learning_rate": 4.8688191274520825e-05, |
|
"loss": 1.1857, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.3112, |
|
"grad_norm": 1.5844037532806396, |
|
"learning_rate": 4.8684841580052496e-05, |
|
"loss": 1.1087, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.3116, |
|
"grad_norm": 1.4313236474990845, |
|
"learning_rate": 4.86814877298506e-05, |
|
"loss": 1.2033, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 1.4828399419784546, |
|
"learning_rate": 4.8678129724503584e-05, |
|
"loss": 1.1876, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3124, |
|
"grad_norm": 1.517876148223877, |
|
"learning_rate": 4.867476756460067e-05, |
|
"loss": 1.2316, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.3128, |
|
"grad_norm": 1.5256012678146362, |
|
"learning_rate": 4.867140125073177e-05, |
|
"loss": 1.2264, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.3132, |
|
"grad_norm": 1.4679909944534302, |
|
"learning_rate": 4.866803078348753e-05, |
|
"loss": 1.1672, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 1.3713257312774658, |
|
"learning_rate": 4.866465616345934e-05, |
|
"loss": 1.2037, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.314, |
|
"grad_norm": 1.5898433923721313, |
|
"learning_rate": 4.8661277391239306e-05, |
|
"loss": 1.122, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.3144, |
|
"grad_norm": 1.6622995138168335, |
|
"learning_rate": 4.8657894467420276e-05, |
|
"loss": 1.1442, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.3148, |
|
"grad_norm": 1.6419644355773926, |
|
"learning_rate": 4.865450739259579e-05, |
|
"loss": 1.1647, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.3152, |
|
"grad_norm": 1.4513815641403198, |
|
"learning_rate": 4.865111616736017e-05, |
|
"loss": 1.1515, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.3156, |
|
"grad_norm": 1.7421190738677979, |
|
"learning_rate": 4.8647720792308425e-05, |
|
"loss": 1.2137, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.316, |
|
"grad_norm": 1.4491236209869385, |
|
"learning_rate": 4.8644321268036314e-05, |
|
"loss": 1.1752, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3164, |
|
"grad_norm": 1.8071634769439697, |
|
"learning_rate": 4.8640917595140314e-05, |
|
"loss": 1.1821, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 1.7549405097961426, |
|
"learning_rate": 4.863750977421762e-05, |
|
"loss": 1.2574, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.3172, |
|
"grad_norm": 1.4997444152832031, |
|
"learning_rate": 4.8634097805866185e-05, |
|
"loss": 1.1735, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.3176, |
|
"grad_norm": 1.421034336090088, |
|
"learning_rate": 4.863068169068467e-05, |
|
"loss": 1.1464, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.318, |
|
"grad_norm": 1.5631002187728882, |
|
"learning_rate": 4.862726142927245e-05, |
|
"loss": 1.2239, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.3184, |
|
"grad_norm": 1.3816300630569458, |
|
"learning_rate": 4.8623837022229656e-05, |
|
"loss": 1.2619, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.3188, |
|
"grad_norm": 1.437118649482727, |
|
"learning_rate": 4.862040847015713e-05, |
|
"loss": 1.1265, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.3192, |
|
"grad_norm": 1.6327508687973022, |
|
"learning_rate": 4.8616975773656445e-05, |
|
"loss": 1.1522, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.3196, |
|
"grad_norm": 1.5986108779907227, |
|
"learning_rate": 4.8613538933329906e-05, |
|
"loss": 1.1543, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.4712469577789307, |
|
"learning_rate": 4.861009794978053e-05, |
|
"loss": 1.1363, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3204, |
|
"grad_norm": 1.5345813035964966, |
|
"learning_rate": 4.860665282361207e-05, |
|
"loss": 1.2319, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.3208, |
|
"grad_norm": 1.425037145614624, |
|
"learning_rate": 4.860320355542901e-05, |
|
"loss": 1.2544, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.3212, |
|
"grad_norm": 1.7960505485534668, |
|
"learning_rate": 4.859975014583657e-05, |
|
"loss": 1.1738, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.3216, |
|
"grad_norm": 1.6393851041793823, |
|
"learning_rate": 4.8596292595440654e-05, |
|
"loss": 1.154, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.322, |
|
"grad_norm": 1.3774423599243164, |
|
"learning_rate": 4.8592830904847956e-05, |
|
"loss": 1.1987, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.3224, |
|
"grad_norm": 1.455966591835022, |
|
"learning_rate": 4.8589365074665836e-05, |
|
"loss": 1.1879, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.3228, |
|
"grad_norm": 1.541073203086853, |
|
"learning_rate": 4.858589510550243e-05, |
|
"loss": 1.174, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 1.5856127738952637, |
|
"learning_rate": 4.858242099796656e-05, |
|
"loss": 1.2052, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.3236, |
|
"grad_norm": 1.6346274614334106, |
|
"learning_rate": 4.85789427526678e-05, |
|
"loss": 1.1631, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.324, |
|
"grad_norm": 1.6323809623718262, |
|
"learning_rate": 4.8575460370216444e-05, |
|
"loss": 1.1798, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.3244, |
|
"grad_norm": 1.6759973764419556, |
|
"learning_rate": 4.85719738512235e-05, |
|
"loss": 1.2269, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.3248, |
|
"grad_norm": 1.4718966484069824, |
|
"learning_rate": 4.856848319630072e-05, |
|
"loss": 1.1528, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.3252, |
|
"grad_norm": 1.486632227897644, |
|
"learning_rate": 4.856498840606056e-05, |
|
"loss": 1.1459, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.3256, |
|
"grad_norm": 1.5227134227752686, |
|
"learning_rate": 4.8561489481116234e-05, |
|
"loss": 1.2153, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.326, |
|
"grad_norm": 1.5857396125793457, |
|
"learning_rate": 4.8557986422081654e-05, |
|
"loss": 1.1898, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 1.5316146612167358, |
|
"learning_rate": 4.855447922957146e-05, |
|
"loss": 1.1493, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.3268, |
|
"grad_norm": 1.4671624898910522, |
|
"learning_rate": 4.855096790420103e-05, |
|
"loss": 1.1801, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.3272, |
|
"grad_norm": 1.4884071350097656, |
|
"learning_rate": 4.854745244658645e-05, |
|
"loss": 1.1678, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.3276, |
|
"grad_norm": 1.5596617460250854, |
|
"learning_rate": 4.8543932857344545e-05, |
|
"loss": 1.1672, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 1.524793267250061, |
|
"learning_rate": 4.854040913709287e-05, |
|
"loss": 1.199, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3284, |
|
"grad_norm": 1.5925273895263672, |
|
"learning_rate": 4.8536881286449684e-05, |
|
"loss": 1.148, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.3288, |
|
"grad_norm": 1.3768569231033325, |
|
"learning_rate": 4.853334930603399e-05, |
|
"loss": 1.1311, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.3292, |
|
"grad_norm": 1.8312355279922485, |
|
"learning_rate": 4.852981319646551e-05, |
|
"loss": 1.1435, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 1.4960983991622925, |
|
"learning_rate": 4.8526272958364674e-05, |
|
"loss": 1.2346, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.4610507488250732, |
|
"learning_rate": 4.852272859235267e-05, |
|
"loss": 1.145, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.3304, |
|
"grad_norm": 1.437524437904358, |
|
"learning_rate": 4.8519180099051375e-05, |
|
"loss": 1.1523, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.3308, |
|
"grad_norm": 1.7281017303466797, |
|
"learning_rate": 4.8515627479083415e-05, |
|
"loss": 1.1713, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.3312, |
|
"grad_norm": 1.7618118524551392, |
|
"learning_rate": 4.851207073307213e-05, |
|
"loss": 1.2128, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.3316, |
|
"grad_norm": 1.6273146867752075, |
|
"learning_rate": 4.8508509861641585e-05, |
|
"loss": 1.1961, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.332, |
|
"grad_norm": 1.548926830291748, |
|
"learning_rate": 4.850494486541657e-05, |
|
"loss": 1.2366, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3324, |
|
"grad_norm": 1.465455412864685, |
|
"learning_rate": 4.8501375745022595e-05, |
|
"loss": 1.1614, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 1.7306607961654663, |
|
"learning_rate": 4.8497802501085895e-05, |
|
"loss": 1.2314, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.3332, |
|
"grad_norm": 1.5650298595428467, |
|
"learning_rate": 4.8494225134233434e-05, |
|
"loss": 1.1667, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.3336, |
|
"grad_norm": 1.6374757289886475, |
|
"learning_rate": 4.84906436450929e-05, |
|
"loss": 1.158, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.334, |
|
"grad_norm": 1.8572663068771362, |
|
"learning_rate": 4.848705803429269e-05, |
|
"loss": 1.2126, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.3344, |
|
"grad_norm": 2.0607059001922607, |
|
"learning_rate": 4.8483468302461943e-05, |
|
"loss": 1.256, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.3348, |
|
"grad_norm": 1.6313749551773071, |
|
"learning_rate": 4.84798744502305e-05, |
|
"loss": 1.2197, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.3352, |
|
"grad_norm": 1.4325757026672363, |
|
"learning_rate": 4.847627647822896e-05, |
|
"loss": 1.1577, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.3356, |
|
"grad_norm": 1.4574649333953857, |
|
"learning_rate": 4.847267438708859e-05, |
|
"loss": 1.1881, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.6589447259902954, |
|
"learning_rate": 4.8469068177441435e-05, |
|
"loss": 1.1362, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.3364, |
|
"grad_norm": 1.4486385583877563, |
|
"learning_rate": 4.8465457849920226e-05, |
|
"loss": 1.1839, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.3368, |
|
"grad_norm": 1.6947062015533447, |
|
"learning_rate": 4.8461843405158446e-05, |
|
"loss": 1.1522, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.3372, |
|
"grad_norm": 2.0725297927856445, |
|
"learning_rate": 4.845822484379027e-05, |
|
"loss": 1.1357, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.3376, |
|
"grad_norm": 1.6981180906295776, |
|
"learning_rate": 4.8454602166450615e-05, |
|
"loss": 1.2319, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.338, |
|
"grad_norm": 1.5875322818756104, |
|
"learning_rate": 4.8450975373775107e-05, |
|
"loss": 1.167, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.3384, |
|
"grad_norm": 1.6823283433914185, |
|
"learning_rate": 4.844734446640011e-05, |
|
"loss": 1.1926, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.3388, |
|
"grad_norm": 1.5389896631240845, |
|
"learning_rate": 4.844370944496271e-05, |
|
"loss": 1.1144, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 1.5896904468536377, |
|
"learning_rate": 4.844007031010068e-05, |
|
"loss": 1.1776, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.3396, |
|
"grad_norm": 1.4404852390289307, |
|
"learning_rate": 4.843642706245257e-05, |
|
"loss": 1.2146, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.3411272764205933, |
|
"learning_rate": 4.84327797026576e-05, |
|
"loss": 1.1603, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3404, |
|
"grad_norm": 1.722702980041504, |
|
"learning_rate": 4.842912823135575e-05, |
|
"loss": 1.2679, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.3408, |
|
"grad_norm": 1.692683219909668, |
|
"learning_rate": 4.84254726491877e-05, |
|
"loss": 1.1717, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.3412, |
|
"grad_norm": 1.814393162727356, |
|
"learning_rate": 4.842181295679486e-05, |
|
"loss": 1.1264, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.3416, |
|
"grad_norm": 1.421974539756775, |
|
"learning_rate": 4.841814915481935e-05, |
|
"loss": 1.1899, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.342, |
|
"grad_norm": 1.519344449043274, |
|
"learning_rate": 4.841448124390403e-05, |
|
"loss": 1.2029, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 1.3807154893875122, |
|
"learning_rate": 4.841080922469247e-05, |
|
"loss": 1.1751, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.3428, |
|
"grad_norm": 1.5611778497695923, |
|
"learning_rate": 4.840713309782895e-05, |
|
"loss": 1.1801, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.3432, |
|
"grad_norm": 1.7587612867355347, |
|
"learning_rate": 4.840345286395849e-05, |
|
"loss": 1.1458, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.3436, |
|
"grad_norm": 2.1552114486694336, |
|
"learning_rate": 4.839976852372682e-05, |
|
"loss": 1.1537, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 1.483056902885437, |
|
"learning_rate": 4.8396080077780394e-05, |
|
"loss": 1.1974, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.3444, |
|
"grad_norm": 1.3012534379959106, |
|
"learning_rate": 4.839238752676638e-05, |
|
"loss": 1.2229, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.3448, |
|
"grad_norm": 1.6024060249328613, |
|
"learning_rate": 4.838869087133269e-05, |
|
"loss": 1.1466, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.3452, |
|
"grad_norm": 2.0319061279296875, |
|
"learning_rate": 4.8384990112127915e-05, |
|
"loss": 1.1838, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 1.35572350025177, |
|
"learning_rate": 4.83812852498014e-05, |
|
"loss": 1.1846, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.346, |
|
"grad_norm": 1.7080681324005127, |
|
"learning_rate": 4.83775762850032e-05, |
|
"loss": 1.151, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.3464, |
|
"grad_norm": 1.3965811729431152, |
|
"learning_rate": 4.837386321838408e-05, |
|
"loss": 1.3165, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.3468, |
|
"grad_norm": 1.673601746559143, |
|
"learning_rate": 4.837014605059555e-05, |
|
"loss": 1.2241, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.3472, |
|
"grad_norm": 1.2754968404769897, |
|
"learning_rate": 4.83664247822898e-05, |
|
"loss": 1.1173, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.3476, |
|
"grad_norm": 1.8693870306015015, |
|
"learning_rate": 4.836269941411978e-05, |
|
"loss": 1.1775, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.348, |
|
"grad_norm": 1.5695033073425293, |
|
"learning_rate": 4.835896994673914e-05, |
|
"loss": 1.1773, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3484, |
|
"grad_norm": 1.314810037612915, |
|
"learning_rate": 4.835523638080224e-05, |
|
"loss": 1.2331, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 1.2816500663757324, |
|
"learning_rate": 4.835149871696418e-05, |
|
"loss": 1.1959, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.3492, |
|
"grad_norm": 1.5477250814437866, |
|
"learning_rate": 4.8347756955880765e-05, |
|
"loss": 1.2451, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.3496, |
|
"grad_norm": 1.6063194274902344, |
|
"learning_rate": 4.834401109820853e-05, |
|
"loss": 1.1905, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.7513134479522705, |
|
"learning_rate": 4.834026114460471e-05, |
|
"loss": 1.2092, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.3504, |
|
"grad_norm": 1.578026533126831, |
|
"learning_rate": 4.833650709572728e-05, |
|
"loss": 1.1645, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.3508, |
|
"grad_norm": 1.597808599472046, |
|
"learning_rate": 4.833274895223492e-05, |
|
"loss": 1.2233, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.3512, |
|
"grad_norm": 1.4159451723098755, |
|
"learning_rate": 4.8328986714787034e-05, |
|
"loss": 1.1685, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.3516, |
|
"grad_norm": 1.6277220249176025, |
|
"learning_rate": 4.832522038404374e-05, |
|
"loss": 1.2059, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.6037828922271729, |
|
"learning_rate": 4.832144996066588e-05, |
|
"loss": 1.1512, |
|
"step": 4400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 37500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 6.908065048162468e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|