{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997726239199636,
  "eval_steps": 500,
  "global_step": 3708,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008084482845737962,
      "grad_norm": 10.217898134986712,
      "learning_rate": 1.7857142857142858e-07,
      "loss": 1.7929,
      "step": 1
    },
    {
      "epoch": 0.008084482845737961,
      "grad_norm": 8.447331849071613,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 1.8062,
      "step": 10
    },
    {
      "epoch": 0.016168965691475922,
      "grad_norm": 3.3131831113560053,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.6026,
      "step": 20
    },
    {
      "epoch": 0.024253448537213885,
      "grad_norm": 1.9123288015392683,
      "learning_rate": 5.357142857142857e-06,
      "loss": 1.5044,
      "step": 30
    },
    {
      "epoch": 0.032337931382951844,
      "grad_norm": 1.5061470360359468,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.46,
      "step": 40
    },
    {
      "epoch": 0.04042241422868981,
      "grad_norm": 1.3730571252951111,
      "learning_rate": 8.92857142857143e-06,
      "loss": 1.4242,
      "step": 50
    },
    {
      "epoch": 0.04850689707442777,
      "grad_norm": 1.3696793978053872,
      "learning_rate": 1.0714285714285714e-05,
      "loss": 1.3962,
      "step": 60
    },
    {
      "epoch": 0.05659137992016573,
      "grad_norm": 1.3254100387294059,
      "learning_rate": 1.25e-05,
      "loss": 1.4019,
      "step": 70
    },
    {
      "epoch": 0.06467586276590369,
      "grad_norm": 1.300617283886077,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 1.38,
      "step": 80
    },
    {
      "epoch": 0.07276034561164166,
      "grad_norm": 1.2890020177179862,
      "learning_rate": 1.6071428571428572e-05,
      "loss": 1.3692,
      "step": 90
    },
    {
      "epoch": 0.08084482845737961,
      "grad_norm": 1.3617464980600442,
      "learning_rate": 1.785714285714286e-05,
      "loss": 1.3717,
      "step": 100
    },
    {
      "epoch": 0.08892931130311758,
      "grad_norm": 1.3364658238171299,
      "learning_rate": 1.9642857142857145e-05,
      "loss": 1.3445,
      "step": 110
    },
    {
      "epoch": 0.09701379414885554,
      "grad_norm": 1.406654933905743,
      "learning_rate": 1.999975576461237e-05,
      "loss": 1.3724,
      "step": 120
    },
    {
      "epoch": 0.1050982769945935,
      "grad_norm": 1.4611284363694028,
      "learning_rate": 1.999876357879684e-05,
      "loss": 1.3446,
      "step": 130
    },
    {
      "epoch": 0.11318275984033146,
      "grad_norm": 1.3711888479566774,
      "learning_rate": 1.9997008253510416e-05,
      "loss": 1.3515,
      "step": 140
    },
    {
      "epoch": 0.12126724268606942,
      "grad_norm": 1.409811295994504,
      "learning_rate": 1.9994489922725454e-05,
      "loss": 1.342,
      "step": 150
    },
    {
      "epoch": 0.12935172553180738,
      "grad_norm": 1.3918093353079757,
      "learning_rate": 1.9991208778649485e-05,
      "loss": 1.3493,
      "step": 160
    },
    {
      "epoch": 0.13743620837754536,
      "grad_norm": 1.3597096387911873,
      "learning_rate": 1.998716507171053e-05,
      "loss": 1.3186,
      "step": 170
    },
    {
      "epoch": 0.14552069122328332,
      "grad_norm": 1.3532852407233782,
      "learning_rate": 1.998235911053798e-05,
      "loss": 1.3426,
      "step": 180
    },
    {
      "epoch": 0.15360517406902127,
      "grad_norm": 1.395929355438092,
      "learning_rate": 1.9976791261939064e-05,
      "loss": 1.338,
      "step": 190
    },
    {
      "epoch": 0.16168965691475923,
      "grad_norm": 1.3346897807354206,
      "learning_rate": 1.997046195087082e-05,
      "loss": 1.3209,
      "step": 200
    },
    {
      "epoch": 0.16977413976049718,
      "grad_norm": 1.3501622187451188,
      "learning_rate": 1.996337166040769e-05,
      "loss": 1.3279,
      "step": 210
    },
    {
      "epoch": 0.17785862260623517,
      "grad_norm": 1.262079121683393,
      "learning_rate": 1.995552093170463e-05,
      "loss": 1.3135,
      "step": 220
    },
    {
      "epoch": 0.18594310545197312,
      "grad_norm": 1.324883144834464,
      "learning_rate": 1.994691036395583e-05,
      "loss": 1.306,
      "step": 230
    },
    {
      "epoch": 0.19402758829771108,
      "grad_norm": 1.373867783740766,
      "learning_rate": 1.9937540614348944e-05,
      "loss": 1.3018,
      "step": 240
    },
    {
      "epoch": 0.20211207114344903,
      "grad_norm": 1.4020861161362783,
      "learning_rate": 1.992741239801498e-05,
      "loss": 1.3203,
      "step": 250
    },
    {
      "epoch": 0.210196553989187,
      "grad_norm": 1.3484650757297245,
      "learning_rate": 1.9916526487973678e-05,
      "loss": 1.2939,
      "step": 260
    },
    {
      "epoch": 0.21828103683492497,
      "grad_norm": 1.3330965331306333,
      "learning_rate": 1.9904883715074525e-05,
      "loss": 1.2795,
      "step": 270
    },
    {
      "epoch": 0.22636551968066293,
      "grad_norm": 1.3917589397233552,
      "learning_rate": 1.989248496793335e-05,
      "loss": 1.269,
      "step": 280
    },
    {
      "epoch": 0.23445000252640089,
      "grad_norm": 1.3905412148367542,
      "learning_rate": 1.9879331192864492e-05,
      "loss": 1.286,
      "step": 290
    },
    {
      "epoch": 0.24253448537213884,
      "grad_norm": 1.4569325197708967,
      "learning_rate": 1.9865423393808573e-05,
      "loss": 1.2944,
      "step": 300
    },
    {
      "epoch": 0.2506189682178768,
      "grad_norm": 1.3399495909594208,
      "learning_rate": 1.985076263225588e-05,
      "loss": 1.3106,
      "step": 310
    },
    {
      "epoch": 0.25870345106361475,
      "grad_norm": 1.478802579336813,
      "learning_rate": 1.9835350027165342e-05,
      "loss": 1.2994,
      "step": 320
    },
    {
      "epoch": 0.26678793390935274,
      "grad_norm": 1.3105244819439577,
      "learning_rate": 1.9819186754879137e-05,
      "loss": 1.2871,
      "step": 330
    },
    {
      "epoch": 0.2748724167550907,
      "grad_norm": 1.3667119896120177,
      "learning_rate": 1.9802274049032898e-05,
      "loss": 1.2893,
      "step": 340
    },
    {
      "epoch": 0.28295689960082865,
      "grad_norm": 1.5054526064910085,
      "learning_rate": 1.9784613200461568e-05,
      "loss": 1.2912,
      "step": 350
    },
    {
      "epoch": 0.29104138244656663,
      "grad_norm": 1.3163243486049039,
      "learning_rate": 1.976620555710087e-05,
      "loss": 1.2761,
      "step": 360
    },
    {
      "epoch": 0.29912586529230456,
      "grad_norm": 1.322920539242633,
      "learning_rate": 1.9747052523884435e-05,
      "loss": 1.2572,
      "step": 370
    },
    {
      "epoch": 0.30721034813804254,
      "grad_norm": 1.3954468357326724,
      "learning_rate": 1.972715556263657e-05,
      "loss": 1.2745,
      "step": 380
    },
    {
      "epoch": 0.3152948309837805,
      "grad_norm": 1.3451929159695755,
      "learning_rate": 1.9706516191960687e-05,
      "loss": 1.2472,
      "step": 390
    },
    {
      "epoch": 0.32337931382951846,
      "grad_norm": 1.2765565775996142,
      "learning_rate": 1.9685135987123396e-05,
      "loss": 1.255,
      "step": 400
    },
    {
      "epoch": 0.33146379667525644,
      "grad_norm": 1.4632541877317655,
      "learning_rate": 1.966301657993428e-05,
      "loss": 1.2565,
      "step": 410
    },
    {
      "epoch": 0.33954827952099437,
      "grad_norm": 1.3554436136314076,
      "learning_rate": 1.9640159658621344e-05,
      "loss": 1.2593,
      "step": 420
    },
    {
      "epoch": 0.34763276236673235,
      "grad_norm": 1.3154961346767526,
      "learning_rate": 1.9616566967702164e-05,
      "loss": 1.2604,
      "step": 430
    },
    {
      "epoch": 0.35571724521247033,
      "grad_norm": 1.3833700211512812,
      "learning_rate": 1.9592240307850748e-05,
      "loss": 1.2625,
      "step": 440
    },
    {
      "epoch": 0.36380172805820826,
      "grad_norm": 1.2812641775550833,
      "learning_rate": 1.95671815357601e-05,
      "loss": 1.2661,
      "step": 450
    },
    {
      "epoch": 0.37188621090394625,
      "grad_norm": 1.3509908047727408,
      "learning_rate": 1.954139256400049e-05,
      "loss": 1.2448,
      "step": 460
    },
    {
      "epoch": 0.3799706937496842,
      "grad_norm": 1.356891388574271,
      "learning_rate": 1.951487536087352e-05,
      "loss": 1.2551,
      "step": 470
    },
    {
      "epoch": 0.38805517659542216,
      "grad_norm": 1.2921423460134738,
      "learning_rate": 1.948763195026186e-05,
      "loss": 1.2503,
      "step": 480
    },
    {
      "epoch": 0.39613965944116014,
      "grad_norm": 1.3494188641362486,
      "learning_rate": 1.9459664411474793e-05,
      "loss": 1.2509,
      "step": 490
    },
    {
      "epoch": 0.40422414228689807,
      "grad_norm": 1.336605272931222,
      "learning_rate": 1.9430974879089522e-05,
      "loss": 1.251,
      "step": 500
    },
    {
      "epoch": 0.41230862513263605,
      "grad_norm": 1.3167568815144604,
      "learning_rate": 1.9401565542788238e-05,
      "loss": 1.2341,
      "step": 510
    },
    {
      "epoch": 0.420393107978374,
      "grad_norm": 1.3704112316871029,
      "learning_rate": 1.9371438647191007e-05,
      "loss": 1.2483,
      "step": 520
    },
    {
      "epoch": 0.42847759082411196,
      "grad_norm": 1.2971253486447214,
      "learning_rate": 1.9340596491684443e-05,
      "loss": 1.2483,
      "step": 530
    },
    {
      "epoch": 0.43656207366984995,
      "grad_norm": 1.278671915851734,
      "learning_rate": 1.9309041430246228e-05,
      "loss": 1.247,
      "step": 540
    },
    {
      "epoch": 0.4446465565155879,
      "grad_norm": 1.7143062654632688,
      "learning_rate": 1.927677587126542e-05,
      "loss": 1.2582,
      "step": 550
    },
    {
      "epoch": 0.45273103936132586,
      "grad_norm": 3.071400396021207,
      "learning_rate": 1.924380227735867e-05,
      "loss": 1.2369,
      "step": 560
    },
    {
      "epoch": 0.46081552220706384,
      "grad_norm": 1.3043426791795303,
      "learning_rate": 1.921012316518224e-05,
      "loss": 1.2564,
      "step": 570
    },
    {
      "epoch": 0.46890000505280177,
      "grad_norm": 1.4606599266501914,
      "learning_rate": 1.917574110523994e-05,
      "loss": 1.2455,
      "step": 580
    },
    {
      "epoch": 0.47698448789853976,
      "grad_norm": 1.4084512281248918,
      "learning_rate": 1.914065872168692e-05,
      "loss": 1.237,
      "step": 590
    },
    {
      "epoch": 0.4850689707442777,
      "grad_norm": 1.7442614101619536,
      "learning_rate": 1.910487869212942e-05,
      "loss": 1.2428,
      "step": 600
    },
    {
      "epoch": 0.49315345359001567,
      "grad_norm": 3.217405569169141,
      "learning_rate": 1.9068403747420365e-05,
      "loss": 1.2406,
      "step": 610
    },
    {
      "epoch": 0.5012379364357537,
      "grad_norm": 2.028319411000034,
      "learning_rate": 1.9031236671450963e-05,
      "loss": 1.2295,
      "step": 620
    },
    {
      "epoch": 0.5093224192814916,
      "grad_norm": 1.3534016099705948,
      "learning_rate": 1.899338030093822e-05,
      "loss": 1.2287,
      "step": 630
    },
    {
      "epoch": 0.5174069021272295,
      "grad_norm": 16.560363141349324,
      "learning_rate": 1.8954837525208432e-05,
      "loss": 1.2239,
      "step": 640
    },
    {
      "epoch": 0.5254913849729675,
      "grad_norm": 1.6507428977016882,
      "learning_rate": 1.8915611285976672e-05,
      "loss": 1.2122,
      "step": 650
    },
    {
      "epoch": 0.5335758678187055,
      "grad_norm": 1.4599892786481696,
      "learning_rate": 1.887570457712225e-05,
      "loss": 1.2448,
      "step": 660
    },
    {
      "epoch": 0.5416603506644434,
      "grad_norm": 1.3576328654723246,
      "learning_rate": 1.883512044446023e-05,
      "loss": 1.235,
      "step": 670
    },
    {
      "epoch": 0.5497448335101814,
      "grad_norm": 2.7275818350144383,
      "learning_rate": 1.879386198550895e-05,
      "loss": 1.2302,
      "step": 680
    },
    {
      "epoch": 0.5578293163559194,
      "grad_norm": 1.4783688191374078,
      "learning_rate": 1.8751932349253595e-05,
      "loss": 1.2183,
      "step": 690
    },
    {
      "epoch": 0.5659137992016573,
      "grad_norm": 1.3696848099680126,
      "learning_rate": 1.8709334735905908e-05,
      "loss": 1.2202,
      "step": 700
    },
    {
      "epoch": 0.5739982820473952,
      "grad_norm": 1.3843064222445587,
      "learning_rate": 1.866607239665988e-05,
      "loss": 1.2292,
      "step": 710
    },
    {
      "epoch": 0.5820827648931333,
      "grad_norm": 1.3013446345815274,
      "learning_rate": 1.8622148633443626e-05,
      "loss": 1.2404,
      "step": 720
    },
    {
      "epoch": 0.5901672477388712,
      "grad_norm": 1.3389494076775972,
      "learning_rate": 1.8577566798667397e-05,
      "loss": 1.2,
      "step": 730
    },
    {
      "epoch": 0.5982517305846091,
      "grad_norm": 1.2803553653933784,
      "learning_rate": 1.8532330294967678e-05,
      "loss": 1.2019,
      "step": 740
    },
    {
      "epoch": 0.6063362134303472,
      "grad_norm": 1.3940783442430897,
      "learning_rate": 1.848644257494751e-05,
      "loss": 1.2111,
      "step": 750
    },
    {
      "epoch": 0.6144206962760851,
      "grad_norm": 1.2967372912925752,
      "learning_rate": 1.8439907140912962e-05,
      "loss": 1.2044,
      "step": 760
    },
    {
      "epoch": 0.622505179121823,
      "grad_norm": 1.307050777866234,
      "learning_rate": 1.839272754460583e-05,
      "loss": 1.211,
      "step": 770
    },
    {
      "epoch": 0.630589661967561,
      "grad_norm": 1.7851865803650349,
      "learning_rate": 1.8344907386932552e-05,
      "loss": 1.2038,
      "step": 780
    },
    {
      "epoch": 0.638674144813299,
      "grad_norm": 1.8614266164299924,
      "learning_rate": 1.8296450317689377e-05,
      "loss": 1.2054,
      "step": 790
    },
    {
      "epoch": 0.6467586276590369,
      "grad_norm": 1.3262638540650757,
      "learning_rate": 1.824736003528381e-05,
      "loss": 1.209,
      "step": 800
    },
    {
      "epoch": 0.654843110504775,
      "grad_norm": 1.290793353111858,
      "learning_rate": 1.8197640286452312e-05,
      "loss": 1.213,
      "step": 810
    },
    {
      "epoch": 0.6629275933505129,
      "grad_norm": 1.2558226934999566,
      "learning_rate": 1.814729486597436e-05,
      "loss": 1.2266,
      "step": 820
    },
    {
      "epoch": 0.6710120761962508,
      "grad_norm": 1.277465841944589,
      "learning_rate": 1.8096327616382815e-05,
      "loss": 1.2167,
      "step": 830
    },
    {
      "epoch": 0.6790965590419887,
      "grad_norm": 1.298887855615747,
      "learning_rate": 1.8044742427670627e-05,
      "loss": 1.2226,
      "step": 840
    },
    {
      "epoch": 0.6871810418877268,
      "grad_norm": 5.857168222574854,
      "learning_rate": 1.7992543236993952e-05,
      "loss": 1.2027,
      "step": 850
    },
    {
      "epoch": 0.6952655247334647,
      "grad_norm": 1.3361306728189393,
      "learning_rate": 1.7939734028371663e-05,
      "loss": 1.207,
      "step": 860
    },
    {
      "epoch": 0.7033500075792026,
      "grad_norm": 1.3969769044659528,
      "learning_rate": 1.7886318832381264e-05,
      "loss": 1.1799,
      "step": 870
    },
    {
      "epoch": 0.7114344904249407,
      "grad_norm": 1.4266930108547686,
      "learning_rate": 1.783230172585126e-05,
      "loss": 1.2111,
      "step": 880
    },
    {
      "epoch": 0.7195189732706786,
      "grad_norm": 1.3440902999919684,
      "learning_rate": 1.7777686831550008e-05,
      "loss": 1.1854,
      "step": 890
    },
    {
      "epoch": 0.7276034561164165,
      "grad_norm": 1.251718689797153,
      "learning_rate": 1.7722478317871053e-05,
      "loss": 1.1803,
      "step": 900
    },
    {
      "epoch": 0.7356879389621546,
      "grad_norm": 1.2756808323680056,
      "learning_rate": 1.7666680398514978e-05,
      "loss": 1.2148,
      "step": 910
    },
    {
      "epoch": 0.7437724218078925,
      "grad_norm": 1.3774590120848857,
      "learning_rate": 1.76102973321678e-05,
      "loss": 1.189,
      "step": 920
    },
    {
      "epoch": 0.7518569046536304,
      "grad_norm": 1.5207360711907143,
      "learning_rate": 1.7553333422175933e-05,
      "loss": 1.1819,
      "step": 930
    },
    {
      "epoch": 0.7599413874993683,
      "grad_norm": 1.302009300658742,
      "learning_rate": 1.7495793016217754e-05,
      "loss": 1.191,
      "step": 940
    },
    {
      "epoch": 0.7680258703451064,
      "grad_norm": 1.3859954985668783,
      "learning_rate": 1.743768050597175e-05,
      "loss": 1.1835,
      "step": 950
    },
    {
      "epoch": 0.7761103531908443,
      "grad_norm": 1.3435502591474426,
      "learning_rate": 1.7379000326781348e-05,
      "loss": 1.2035,
      "step": 960
    },
    {
      "epoch": 0.7841948360365822,
      "grad_norm": 1.38981939520544,
      "learning_rate": 1.7319756957316392e-05,
      "loss": 1.1887,
      "step": 970
    },
    {
      "epoch": 0.7922793188823203,
      "grad_norm": 1.4015519572670776,
      "learning_rate": 1.725995491923131e-05,
      "loss": 1.1843,
      "step": 980
    },
    {
      "epoch": 0.8003638017280582,
      "grad_norm": 1.4763071143801054,
      "learning_rate": 1.7199598776820013e-05,
      "loss": 1.1753,
      "step": 990
    },
    {
      "epoch": 0.8084482845737961,
      "grad_norm": 1.3577477544239007,
      "learning_rate": 1.713869313666753e-05,
      "loss": 1.1966,
      "step": 1000
    },
    {
      "epoch": 0.8165327674195342,
      "grad_norm": 1.3963231420568032,
      "learning_rate": 1.7077242647298405e-05,
      "loss": 1.1985,
      "step": 1010
    },
    {
      "epoch": 0.8246172502652721,
      "grad_norm": 1.5498623314696613,
      "learning_rate": 1.7015251998821938e-05,
      "loss": 1.1785,
      "step": 1020
    },
    {
      "epoch": 0.83270173311101,
      "grad_norm": 1.3586468512222978,
      "learning_rate": 1.6952725922574188e-05,
      "loss": 1.1648,
      "step": 1030
    },
    {
      "epoch": 0.840786215956748,
      "grad_norm": 1.4300342736321576,
      "learning_rate": 1.688966919075687e-05,
      "loss": 1.1666,
      "step": 1040
    },
    {
      "epoch": 0.848870698802486,
      "grad_norm": 1.5788283624417567,
      "learning_rate": 1.682608661607313e-05,
      "loss": 1.1821,
      "step": 1050
    },
    {
      "epoch": 0.8569551816482239,
      "grad_norm": 1.359570582214726,
      "learning_rate": 1.6761983051360232e-05,
      "loss": 1.1958,
      "step": 1060
    },
    {
      "epoch": 0.8650396644939619,
      "grad_norm": 1.3046392847858388,
      "learning_rate": 1.6697363389219147e-05,
      "loss": 1.1557,
      "step": 1070
    },
    {
      "epoch": 0.8731241473396999,
      "grad_norm": 1.4677129965264875,
      "learning_rate": 1.6632232561641158e-05,
      "loss": 1.1593,
      "step": 1080
    },
    {
      "epoch": 0.8812086301854378,
      "grad_norm": 1.4859252531152671,
      "learning_rate": 1.6566595539631417e-05,
      "loss": 1.1753,
      "step": 1090
    },
    {
      "epoch": 0.8892931130311758,
      "grad_norm": 1.3209365154297203,
      "learning_rate": 1.6500457332829553e-05,
      "loss": 1.161,
      "step": 1100
    },
    {
      "epoch": 0.8973775958769138,
      "grad_norm": 1.3862159117294945,
      "learning_rate": 1.6433822989127314e-05,
      "loss": 1.1592,
      "step": 1110
    },
    {
      "epoch": 0.9054620787226517,
      "grad_norm": 1.4456179949854164,
      "learning_rate": 1.636669759428329e-05,
      "loss": 1.1484,
      "step": 1120
    },
    {
      "epoch": 0.9135465615683896,
      "grad_norm": 1.288756152636894,
      "learning_rate": 1.6299086271534764e-05,
      "loss": 1.181,
      "step": 1130
    },
    {
      "epoch": 0.9216310444141277,
      "grad_norm": 1.2599229391965052,
      "learning_rate": 1.6230994181206674e-05,
      "loss": 1.1718,
      "step": 1140
    },
    {
      "epoch": 0.9297155272598656,
      "grad_norm": 1.4973902946133841,
      "learning_rate": 1.6162426520317765e-05,
      "loss": 1.1773,
      "step": 1150
    },
    {
      "epoch": 0.9378000101056035,
      "grad_norm": 1.3698767908727083,
      "learning_rate": 1.6093388522183948e-05,
      "loss": 1.1666,
      "step": 1160
    },
    {
      "epoch": 0.9458844929513415,
      "grad_norm": 1.386433062647111,
      "learning_rate": 1.6023885456018852e-05,
      "loss": 1.1859,
      "step": 1170
    },
    {
      "epoch": 0.9539689757970795,
      "grad_norm": 1.284904254015402,
      "learning_rate": 1.595392262653168e-05,
      "loss": 1.1906,
      "step": 1180
    },
    {
      "epoch": 0.9620534586428174,
      "grad_norm": 1.4402131637475677,
      "learning_rate": 1.5883505373522317e-05,
      "loss": 1.1593,
      "step": 1190
    },
    {
      "epoch": 0.9701379414885554,
      "grad_norm": 1.6049356540049453,
      "learning_rate": 1.5812639071473804e-05,
      "loss": 1.1636,
      "step": 1200
    },
    {
      "epoch": 0.9782224243342934,
      "grad_norm": 1.505036374645861,
      "learning_rate": 1.574132912914211e-05,
      "loss": 1.14,
      "step": 1210
    },
    {
      "epoch": 0.9863069071800313,
      "grad_norm": 1.6280895974825729,
      "learning_rate": 1.566958098914334e-05,
      "loss": 1.1358,
      "step": 1220
    },
    {
      "epoch": 0.9943913900257693,
      "grad_norm": 1.2574161457807662,
      "learning_rate": 1.5597400127538324e-05,
      "loss": 1.1754,
      "step": 1230
    },
    {
      "epoch": 0.9992420797332121,
      "eval_loss": 1.0555766820907593,
      "eval_runtime": 476.758,
      "eval_samples_per_second": 25.514,
      "eval_steps_per_second": 12.757,
      "step": 1236
    },
    {
      "epoch": 1.0024758728715073,
      "grad_norm": 2.9356360899500897,
      "learning_rate": 1.5524792053414676e-05,
      "loss": 1.1182,
      "step": 1240
    },
    {
      "epoch": 1.0105603557172451,
      "grad_norm": 1.4115997260524025,
      "learning_rate": 1.5451762308466302e-05,
      "loss": 1.0448,
      "step": 1250
    },
    {
      "epoch": 1.0186448385629832,
      "grad_norm": 1.4408354404654395,
      "learning_rate": 1.5378316466570466e-05,
      "loss": 1.027,
      "step": 1260
    },
    {
      "epoch": 1.0267293214087212,
      "grad_norm": 1.40209737150782,
      "learning_rate": 1.530446013336235e-05,
      "loss": 1.0253,
      "step": 1270
    },
    {
      "epoch": 1.034813804254459,
      "grad_norm": 1.4050923085204698,
      "learning_rate": 1.5230198945807226e-05,
      "loss": 1.0596,
      "step": 1280
    },
    {
      "epoch": 1.042898287100197,
      "grad_norm": 1.3850604464116953,
      "learning_rate": 1.515553857177022e-05,
      "loss": 1.0354,
      "step": 1290
    },
    {
      "epoch": 1.050982769945935,
      "grad_norm": 1.6192982769908866,
      "learning_rate": 1.5080484709583715e-05,
      "loss": 1.0338,
      "step": 1300
    },
    {
      "epoch": 1.059067252791673,
      "grad_norm": 1.5443333411983042,
      "learning_rate": 1.5005043087612452e-05,
      "loss": 1.0224,
      "step": 1310
    },
    {
      "epoch": 1.067151735637411,
      "grad_norm": 1.4795375887873081,
      "learning_rate": 1.4929219463816302e-05,
      "loss": 1.0273,
      "step": 1320
    },
    {
      "epoch": 1.075236218483149,
      "grad_norm": 1.3952469643942318,
      "learning_rate": 1.4853019625310813e-05,
      "loss": 1.0165,
      "step": 1330
    },
    {
      "epoch": 1.0833207013288868,
      "grad_norm": 1.4102438583126526,
      "learning_rate": 1.4776449387925507e-05,
      "loss": 1.0323,
      "step": 1340
    },
    {
      "epoch": 1.0914051841746248,
      "grad_norm": 1.4166513317270177,
      "learning_rate": 1.4699514595760006e-05,
      "loss": 1.0343,
      "step": 1350
    },
    {
      "epoch": 1.0994896670203629,
      "grad_norm": 1.4572773218335806,
      "learning_rate": 1.4622221120737985e-05,
      "loss": 1.0449,
      "step": 1360
    },
    {
      "epoch": 1.1075741498661007,
      "grad_norm": 1.4277575864922984,
      "learning_rate": 1.4544574862159013e-05,
      "loss": 1.0157,
      "step": 1370
    },
    {
      "epoch": 1.1156586327118387,
      "grad_norm": 1.8246683293221693,
      "learning_rate": 1.446658174624829e-05,
      "loss": 1.037,
      "step": 1380
    },
    {
      "epoch": 1.1237431155575768,
      "grad_norm": 1.4515508954548648,
      "learning_rate": 1.4388247725704338e-05,
      "loss": 1.0163,
      "step": 1390
    },
    {
      "epoch": 1.1318275984033146,
      "grad_norm": 1.4472625641065484,
      "learning_rate": 1.4309578779244678e-05,
      "loss": 1.0339,
      "step": 1400
    },
    {
      "epoch": 1.1399120812490526,
      "grad_norm": 1.441284439472294,
      "learning_rate": 1.423058091114951e-05,
      "loss": 1.0153,
      "step": 1410
    },
    {
      "epoch": 1.1479965640947905,
      "grad_norm": 1.4505444065925723,
      "learning_rate": 1.4151260150803445e-05,
      "loss": 1.0413,
      "step": 1420
    },
    {
      "epoch": 1.1560810469405285,
      "grad_norm": 1.5566575848024742,
      "learning_rate": 1.4071622552235327e-05,
      "loss": 1.014,
      "step": 1430
    },
    {
      "epoch": 1.1641655297862665,
      "grad_norm": 1.476527456836737,
      "learning_rate": 1.399167419365616e-05,
      "loss": 1.0374,
      "step": 1440
    },
    {
      "epoch": 1.1722500126320043,
      "grad_norm": 1.7587555981022083,
      "learning_rate": 1.3911421176995206e-05,
      "loss": 1.0145,
      "step": 1450
    },
    {
      "epoch": 1.1803344954777424,
      "grad_norm": 1.5447530212974045,
      "learning_rate": 1.3830869627434267e-05,
      "loss": 1.0104,
      "step": 1460
    },
    {
      "epoch": 1.1884189783234804,
      "grad_norm": 1.368002967716879,
      "learning_rate": 1.3750025692940174e-05,
      "loss": 1.0102,
      "step": 1470
    },
    {
      "epoch": 1.1965034611692182,
      "grad_norm": 1.5132346329088506,
      "learning_rate": 1.3668895543795581e-05,
      "loss": 1.0241,
      "step": 1480
    },
    {
      "epoch": 1.2045879440149563,
      "grad_norm": 1.4535090384504317,
      "learning_rate": 1.3587485372128e-05,
      "loss": 1.01,
      "step": 1490
    },
    {
      "epoch": 1.2126724268606943,
      "grad_norm": 1.6349536867702466,
      "learning_rate": 1.3505801391437215e-05,
      "loss": 1.0538,
      "step": 1500
    },
    {
      "epoch": 1.2207569097064321,
      "grad_norm": 1.608679365926187,
      "learning_rate": 1.3423849836121043e-05,
      "loss": 1.0256,
      "step": 1510
    },
    {
      "epoch": 1.2288413925521702,
      "grad_norm": 1.4875509565909706,
      "learning_rate": 1.33416369609995e-05,
      "loss": 1.0365,
      "step": 1520
    },
    {
      "epoch": 1.2369258753979082,
      "grad_norm": 1.4161399144655036,
      "learning_rate": 1.325916904083741e-05,
      "loss": 1.0285,
      "step": 1530
    },
    {
      "epoch": 1.245010358243646,
      "grad_norm": 1.516547180031239,
      "learning_rate": 1.3176452369865504e-05,
      "loss": 0.9972,
      "step": 1540
    },
    {
      "epoch": 1.253094841089384,
      "grad_norm": 1.4500310981963098,
      "learning_rate": 1.3093493261300012e-05,
      "loss": 1.0122,
      "step": 1550
    },
    {
      "epoch": 1.261179323935122,
      "grad_norm": 1.3787551364346502,
      "learning_rate": 1.3010298046860821e-05,
      "loss": 1.0221,
      "step": 1560
    },
    {
      "epoch": 1.26926380678086,
      "grad_norm": 1.3579456863416077,
      "learning_rate": 1.2926873076288222e-05,
      "loss": 1.0213,
      "step": 1570
    },
    {
      "epoch": 1.277348289626598,
      "grad_norm": 1.4774509503134268,
      "learning_rate": 1.2843224716858271e-05,
      "loss": 1.012,
      "step": 1580
    },
    {
      "epoch": 1.285432772472336,
      "grad_norm": 1.4805342986177266,
      "learning_rate": 1.2759359352896809e-05,
      "loss": 1.0193,
      "step": 1590
    },
    {
      "epoch": 1.2935172553180738,
      "grad_norm": 1.4527468028008124,
      "learning_rate": 1.2675283385292212e-05,
      "loss": 1.0431,
      "step": 1600
    },
    {
      "epoch": 1.3016017381638119,
      "grad_norm": 1.5688075844044822,
      "learning_rate": 1.259100323100682e-05,
      "loss": 1.0226,
      "step": 1610
    },
    {
      "epoch": 1.30968622100955,
      "grad_norm": 1.493324687221304,
      "learning_rate": 1.2506525322587207e-05,
      "loss": 0.9966,
      "step": 1620
    },
    {
      "epoch": 1.3177707038552877,
      "grad_norm": 1.563824009098089,
      "learning_rate": 1.2421856107673205e-05,
      "loss": 1.0317,
      "step": 1630
    },
    {
      "epoch": 1.3258551867010258,
      "grad_norm": 1.4698666764020467,
      "learning_rate": 1.233700204850581e-05,
      "loss": 1.0013,
      "step": 1640
    },
    {
      "epoch": 1.3339396695467638,
      "grad_norm": 1.625463847709757,
      "learning_rate": 1.2251969621433947e-05,
      "loss": 1.0233,
      "step": 1650
    },
    {
      "epoch": 1.3420241523925016,
      "grad_norm": 1.560576858468798,
      "learning_rate": 1.2166765316420195e-05,
      "loss": 1.0137,
      "step": 1660
    },
    {
      "epoch": 1.3501086352382397,
      "grad_norm": 1.6305115869655395,
      "learning_rate": 1.2081395636545432e-05,
      "loss": 1.0074,
      "step": 1670
    },
    {
      "epoch": 1.3581931180839777,
      "grad_norm": 1.683367869903662,
      "learning_rate": 1.1995867097512504e-05,
      "loss": 1.0202,
      "step": 1680
    },
    {
      "epoch": 1.3662776009297155,
      "grad_norm": 1.342629975477622,
      "learning_rate": 1.191018622714893e-05,
      "loss": 1.0039,
      "step": 1690
    },
    {
      "epoch": 1.3743620837754535,
      "grad_norm": 1.4162506108365653,
      "learning_rate": 1.1824359564908667e-05,
      "loss": 1.0303,
      "step": 1700
    },
    {
      "epoch": 1.3824465666211916,
      "grad_norm": 1.4322509952288762,
      "learning_rate": 1.1738393661373004e-05,
      "loss": 1.0223,
      "step": 1710
    },
    {
      "epoch": 1.3905310494669294,
      "grad_norm": 1.4429525488762647,
      "learning_rate": 1.1652295077750599e-05,
      "loss": 1.0079,
      "step": 1720
    },
    {
      "epoch": 1.3986155323126674,
      "grad_norm": 1.5044521870868257,
      "learning_rate": 1.1566070385376705e-05,
      "loss": 0.9903,
      "step": 1730
    },
    {
      "epoch": 1.4067000151584053,
      "grad_norm": 1.4591518605463256,
      "learning_rate": 1.1479726165211609e-05,
      "loss": 1.0133,
      "step": 1740
    },
    {
      "epoch": 1.4147844980041433,
      "grad_norm": 1.38699009818023,
      "learning_rate": 1.1393269007338375e-05,
      "loss": 1.0191,
      "step": 1750
    },
    {
      "epoch": 1.4228689808498813,
      "grad_norm": 1.4248174199771946,
      "learning_rate": 1.1306705510459852e-05,
      "loss": 1.0048,
      "step": 1760
    },
    {
      "epoch": 1.4309534636956192,
      "grad_norm": 1.5368128288739022,
      "learning_rate": 1.1220042281395042e-05,
      "loss": 1.0169,
      "step": 1770
    },
    {
      "epoch": 1.4390379465413572,
      "grad_norm": 1.620365193180215,
      "learning_rate": 1.1133285934574849e-05,
      "loss": 0.9982,
      "step": 1780
    },
    {
      "epoch": 1.447122429387095,
      "grad_norm": 1.4821421519804139,
      "learning_rate": 1.1046443091537232e-05,
      "loss": 1.0241,
      "step": 1790
    },
    {
      "epoch": 1.455206912232833,
      "grad_norm": 1.5012997646705204,
      "learning_rate": 1.0959520380421831e-05,
      "loss": 1.0116,
      "step": 1800
    },
    {
      "epoch": 1.463291395078571,
      "grad_norm": 1.4878335919543981,
      "learning_rate": 1.0872524435464104e-05,
      "loss": 0.9993,
      "step": 1810
    },
    {
      "epoch": 1.471375877924309,
      "grad_norm": 1.3918759318142178,
      "learning_rate": 1.0785461896488947e-05,
      "loss": 1.0103,
      "step": 1820
    },
    {
      "epoch": 1.479460360770047,
      "grad_norm": 1.7724767013914755,
      "learning_rate": 1.0698339408403944e-05,
      "loss": 0.9862,
      "step": 1830
    },
    {
      "epoch": 1.487544843615785,
      "grad_norm": 2.0093844914876717,
      "learning_rate": 1.06111636206922e-05,
      "loss": 1.0039,
      "step": 1840
    },
    {
      "epoch": 1.4956293264615228,
      "grad_norm": 1.4440349729006745,
      "learning_rate": 1.0523941186904823e-05,
      "loss": 1.0091,
      "step": 1850
    },
    {
      "epoch": 1.5037138093072608,
      "grad_norm": 1.5530469064140777,
      "learning_rate": 1.043667876415311e-05,
      "loss": 0.9959,
      "step": 1860
    },
    {
      "epoch": 1.5117982921529989,
      "grad_norm": 1.9710010624543786,
      "learning_rate": 1.0349383012600448e-05,
      "loss": 0.9902,
      "step": 1870
    },
    {
      "epoch": 1.5198827749987367,
      "grad_norm": 1.4874119470603941,
      "learning_rate": 1.0262060594954e-05,
      "loss": 0.9889,
      "step": 1880
    },
    {
      "epoch": 1.5279672578444747,
      "grad_norm": 1.5760932908781828,
      "learning_rate": 1.0174718175956164e-05,
      "loss": 0.997,
      "step": 1890
    },
    {
      "epoch": 1.5360517406902128,
      "grad_norm": 1.5140336706570001,
      "learning_rate": 1.0087362421875912e-05,
      "loss": 1.0162,
      "step": 1900
    },
    {
      "epoch": 1.5441362235359506,
      "grad_norm": 1.4275012742483075,
      "learning_rate": 1e-05,
      "loss": 1.0056,
      "step": 1910
    },
    {
      "epoch": 1.5522207063816886,
      "grad_norm": 1.4479646715349155,
      "learning_rate": 9.912637578124092e-06,
      "loss": 0.9831,
      "step": 1920
    },
    {
      "epoch": 1.5603051892274267,
      "grad_norm": 1.6529106306573094,
      "learning_rate": 9.825281824043838e-06,
      "loss": 1.0009,
      "step": 1930
    },
    {
      "epoch": 1.5683896720731645,
      "grad_norm": 1.4537655155385498,
      "learning_rate": 9.737939405046002e-06,
      "loss": 1.0058,
      "step": 1940
    },
    {
      "epoch": 1.5764741549189025,
      "grad_norm": 1.3881828231981752,
      "learning_rate": 9.650616987399553e-06,
      "loss": 0.9752,
      "step": 1950
    },
    {
      "epoch": 1.5845586377646406,
      "grad_norm": 1.4410127433172688,
      "learning_rate": 9.563321235846894e-06,
      "loss": 1.0026,
      "step": 1960
    },
    {
      "epoch": 1.5926431206103784,
      "grad_norm": 1.6585729752037028,
      "learning_rate": 9.476058813095182e-06,
      "loss": 0.9942,
      "step": 1970
    },
    {
      "epoch": 1.6007276034561164,
      "grad_norm": 1.6572316797520206,
      "learning_rate": 9.388836379307802e-06,
      "loss": 0.9968,
      "step": 1980
    },
    {
      "epoch": 1.6088120863018545,
      "grad_norm": 1.451151024162774,
      "learning_rate": 9.301660591596059e-06,
      "loss": 0.9921,
      "step": 1990
    },
    {
      "epoch": 1.6168965691475923,
      "grad_norm": 1.5042478185497792,
      "learning_rate": 9.214538103511053e-06,
      "loss": 0.9959,
      "step": 2000
    },
    {
      "epoch": 1.6249810519933303,
      "grad_norm": 1.4096442655309245,
      "learning_rate": 9.127475564535898e-06,
      "loss": 0.9944,
      "step": 2010
    },
    {
      "epoch": 1.6330655348390684,
      "grad_norm": 1.3701103693221475,
      "learning_rate": 9.04047961957817e-06,
      "loss": 0.9806,
      "step": 2020
    },
    {
      "epoch": 1.6411500176848062,
      "grad_norm": 1.6771886101217564,
      "learning_rate": 8.953556908462773e-06,
      "loss": 0.9986,
      "step": 2030
    },
    {
      "epoch": 1.6492345005305442,
      "grad_norm": 1.4606744478213272,
      "learning_rate": 8.866714065425154e-06,
      "loss": 0.9894,
      "step": 2040
    },
    {
      "epoch": 1.6573189833762823,
      "grad_norm": 1.5696191298486186,
      "learning_rate": 8.779957718604956e-06,
      "loss": 1.0055,
      "step": 2050
    },
    {
      "epoch": 1.66540346622202,
      "grad_norm": 1.4621439613400917,
      "learning_rate": 8.693294489540151e-06,
      "loss": 1.0055,
      "step": 2060
    },
    {
      "epoch": 1.673487949067758,
      "grad_norm": 1.4224764910826249,
      "learning_rate": 8.60673099266163e-06,
      "loss": 0.9687,
      "step": 2070
    },
    {
      "epoch": 1.6815724319134961,
      "grad_norm": 1.6938323822086323,
      "learning_rate": 8.520273834788395e-06,
      "loss": 0.978,
      "step": 2080
    },
    {
      "epoch": 1.689656914759234,
      "grad_norm": 1.5856717495753165,
      "learning_rate": 8.4339296146233e-06,
      "loss": 0.992,
      "step": 2090
    },
    {
      "epoch": 1.697741397604972,
      "grad_norm": 1.4737528022353619,
      "learning_rate": 8.3477049222494e-06,
      "loss": 0.9882,
      "step": 2100
    },
    {
      "epoch": 1.70582588045071,
      "grad_norm": 1.4413576604331515,
      "learning_rate": 8.261606338626998e-06,
      "loss": 0.9717,
      "step": 2110
    },
    {
      "epoch": 1.7139103632964479,
      "grad_norm": 1.4533604100239785,
      "learning_rate": 8.17564043509134e-06,
      "loss": 0.9878,
      "step": 2120
    },
    {
      "epoch": 1.7219948461421857,
      "grad_norm": 1.4996211527080612,
      "learning_rate": 8.089813772851073e-06,
      "loss": 0.9932,
      "step": 2130
    },
    {
      "epoch": 1.730079328987924,
      "grad_norm": 1.4183735479797297,
      "learning_rate": 8.004132902487499e-06,
      "loss": 1.0021,
      "step": 2140
    },
    {
      "epoch": 1.7381638118336618,
      "grad_norm": 1.4020103234354604,
      "learning_rate": 7.91860436345457e-06,
      "loss": 0.9717,
      "step": 2150
    },
    {
      "epoch": 1.7462482946793996,
      "grad_norm": 1.4529101522297827,
      "learning_rate": 7.833234683579806e-06,
      "loss": 0.9844,
      "step": 2160
    },
    {
      "epoch": 1.7543327775251378,
      "grad_norm": 1.4502465958251158,
      "learning_rate": 7.748030378566056e-06,
      "loss": 0.9782,
      "step": 2170
    },
    {
      "epoch": 1.7624172603708756,
      "grad_norm": 1.4461707858445054,
      "learning_rate": 7.662997951494193e-06,
      "loss": 0.9836,
      "step": 2180
    },
    {
      "epoch": 1.7705017432166135,
      "grad_norm": 1.3966480403360386,
      "learning_rate": 7.578143892326797e-06,
      "loss": 1.0089,
      "step": 2190
    },
    {
      "epoch": 1.7785862260623517,
      "grad_norm": 1.5838575969719086,
      "learning_rate": 7.493474677412795e-06,
      "loss": 1.0017,
      "step": 2200
    },
    {
      "epoch": 1.7866707089080895,
      "grad_norm": 1.6412461821364432,
      "learning_rate": 7.408996768993184e-06,
      "loss": 0.9889,
      "step": 2210
    },
    {
      "epoch": 1.7947551917538274,
      "grad_norm": 1.8686882471940454,
      "learning_rate": 7.324716614707794e-06,
      "loss": 0.9814,
      "step": 2220
    },
    {
      "epoch": 1.8028396745995656,
      "grad_norm": 1.4444454657231485,
      "learning_rate": 7.240640647103192e-06,
      "loss": 0.9934,
      "step": 2230
    },
    {
      "epoch": 1.8109241574453034,
      "grad_norm": 1.5880994051473134,
      "learning_rate": 7.156775283141733e-06,
      "loss": 0.9972,
      "step": 2240
    },
    {
      "epoch": 1.8190086402910413,
      "grad_norm": 1.6179768250952558,
      "learning_rate": 7.0731269237117775e-06,
      "loss": 0.9805,
      "step": 2250
    },
    {
      "epoch": 1.8270931231367793,
      "grad_norm": 1.4161571668846493,
      "learning_rate": 6.989701953139181e-06,
      "loss": 0.9695,
      "step": 2260
    },
    {
      "epoch": 1.8351776059825173,
      "grad_norm": 1.8752619329260358,
      "learning_rate": 6.906506738699994e-06,
      "loss": 0.9899,
      "step": 2270
    },
    {
      "epoch": 1.8432620888282552,
      "grad_norm": 1.8476640791436918,
      "learning_rate": 6.823547630134497e-06,
      "loss": 0.9799,
      "step": 2280
    },
    {
      "epoch": 1.8513465716739932,
      "grad_norm": 1.5003229948984453,
      "learning_rate": 6.740830959162592e-06,
      "loss": 0.9948,
      "step": 2290
    },
    {
      "epoch": 1.8594310545197312,
      "grad_norm": 1.4363919724793655,
      "learning_rate": 6.658363039000501e-06,
      "loss": 0.9625,
      "step": 2300
    },
    {
      "epoch": 1.867515537365469,
      "grad_norm": 1.45857815520064,
      "learning_rate": 6.57615016387896e-06,
      "loss": 0.976,
      "step": 2310
    },
    {
      "epoch": 1.875600020211207,
      "grad_norm": 1.3637017381911254,
      "learning_rate": 6.4941986085627895e-06,
      "loss": 0.9608,
      "step": 2320
    },
    {
      "epoch": 1.8836845030569451,
      "grad_norm": 1.586134857640991,
      "learning_rate": 6.412514627872003e-06,
      "loss": 0.9702,
      "step": 2330
    },
    {
      "epoch": 1.891768985902683,
      "grad_norm": 1.6293874205755696,
      "learning_rate": 6.331104456204423e-06,
      "loss": 0.9672,
      "step": 2340
    },
    {
      "epoch": 1.899853468748421,
      "grad_norm": 1.6185456719315228,
      "learning_rate": 6.249974307059826e-06,
      "loss": 0.9683,
      "step": 2350
    },
    {
      "epoch": 1.907937951594159,
      "grad_norm": 1.5897776438113254,
      "learning_rate": 6.169130372565737e-06,
      "loss": 0.9942,
      "step": 2360
    },
    {
      "epoch": 1.9160224344398968,
      "grad_norm": 1.4621464766459995,
      "learning_rate": 6.088578823004796e-06,
      "loss": 0.9552,
      "step": 2370
    },
    {
      "epoch": 1.9241069172856349,
      "grad_norm": 1.57419066036152,
      "learning_rate": 6.008325806343842e-06,
      "loss": 0.9635,
      "step": 2380
    },
    {
      "epoch": 1.932191400131373,
      "grad_norm": 1.4154240767952921,
      "learning_rate": 5.9283774477646775e-06,
      "loss": 0.9661,
      "step": 2390
    },
    {
      "epoch": 1.9402758829771107,
      "grad_norm": 1.4089774352311322,
      "learning_rate": 5.848739849196556e-06,
      "loss": 0.9623,
      "step": 2400
    },
    {
      "epoch": 1.9483603658228488,
      "grad_norm": 1.4330997113061938,
      "learning_rate": 5.7694190888504964e-06,
      "loss": 0.982,
      "step": 2410
    },
    {
      "epoch": 1.9564448486685868,
      "grad_norm": 1.762833270995275,
      "learning_rate": 5.690421220755329e-06,
      "loss": 0.968,
      "step": 2420
    },
    {
      "epoch": 1.9645293315143246,
      "grad_norm": 1.57370551896378,
      "learning_rate": 5.611752274295665e-06,
      "loss": 0.9639,
      "step": 2430
    },
    {
      "epoch": 1.9726138143600627,
      "grad_norm": 1.4682932578058885,
      "learning_rate": 5.533418253751714e-06,
      "loss": 0.9786,
      "step": 2440
    },
    {
      "epoch": 1.9806982972058007,
      "grad_norm": 1.7633821953728437,
      "learning_rate": 5.455425137840987e-06,
      "loss": 0.9618,
      "step": 2450
    },
    {
      "epoch": 1.9887827800515385,
      "grad_norm": 1.5018261369656176,
      "learning_rate": 5.377778879262017e-06,
      "loss": 0.9454,
      "step": 2460
    },
    {
      "epoch": 1.9968672628972766,
      "grad_norm": 1.5404280086355402,
      "learning_rate": 5.300485404239999e-06,
      "loss": 0.9628,
      "step": 2470
    },
    {
      "epoch": 1.999292607750998,
      "eval_loss": 0.8751075863838196,
      "eval_runtime": 481.67,
      "eval_samples_per_second": 25.254,
      "eval_steps_per_second": 12.627,
      "step": 2473
    },
    {
      "epoch": 2.0049517457430146,
      "grad_norm": 1.8577507088673693,
      "learning_rate": 5.223550612074497e-06,
      "loss": 0.8752,
      "step": 2480
    },
    {
      "epoch": 2.0130362285887524,
      "grad_norm": 1.5570324756102374,
      "learning_rate": 5.146980374689192e-06,
      "loss": 0.8398,
      "step": 2490
    },
    {
      "epoch": 2.0211207114344902,
      "grad_norm": 1.645225536576169,
      "learning_rate": 5.070780536183698e-06,
      "loss": 0.856,
      "step": 2500
    },
    {
      "epoch": 2.0292051942802285,
      "grad_norm": 1.6698633554870226,
      "learning_rate": 4.99495691238755e-06,
      "loss": 0.8365,
      "step": 2510
    },
    {
      "epoch": 2.0372896771259663,
      "grad_norm": 2.010967933907663,
      "learning_rate": 4.9195152904162865e-06,
      "loss": 0.8308,
      "step": 2520
    },
    {
      "epoch": 2.045374159971704,
      "grad_norm": 1.4592026658551123,
      "learning_rate": 4.844461428229782e-06,
      "loss": 0.8387,
      "step": 2530
    },
    {
      "epoch": 2.0534586428174424,
      "grad_norm": 1.9716723547932462,
      "learning_rate": 4.769801054192776e-06,
      "loss": 0.8374,
      "step": 2540
    },
    {
      "epoch": 2.06154312566318,
      "grad_norm": 1.6334367414667887,
      "learning_rate": 4.695539866637653e-06,
      "loss": 0.8587,
      "step": 2550
    },
    {
      "epoch": 2.069627608508918,
      "grad_norm": 1.713926689166813,
      "learning_rate": 4.6216835334295385e-06,
      "loss": 0.8376,
      "step": 2560
    },
    {
      "epoch": 2.0777120913546563,
      "grad_norm": 1.5714175555320091,
      "learning_rate": 4.548237691533699e-06,
      "loss": 0.8346,
      "step": 2570
    },
    {
      "epoch": 2.085796574200394,
      "grad_norm": 1.4811489223457255,
      "learning_rate": 4.475207946585328e-06,
      "loss": 0.8473,
      "step": 2580
    },
    {
      "epoch": 2.093881057046132,
      "grad_norm": 1.4400201402098334,
      "learning_rate": 4.402599872461678e-06,
      "loss": 0.8309,
      "step": 2590
    },
    {
      "epoch": 2.10196553989187,
      "grad_norm": 1.5527150219002093,
      "learning_rate": 4.330419010856661e-06,
      "loss": 0.8312,
      "step": 2600
    },
    {
      "epoch": 2.110050022737608,
      "grad_norm": 1.4540137626455856,
      "learning_rate": 4.258670870857894e-06,
      "loss": 0.8461,
      "step": 2610
    },
    {
      "epoch": 2.118134505583346,
      "grad_norm": 1.5200526871374724,
      "learning_rate": 4.187360928526198e-06,
      "loss": 0.8353,
      "step": 2620
    },
    {
      "epoch": 2.126218988429084,
      "grad_norm": 1.487656190760893,
      "learning_rate": 4.116494626477684e-06,
      "loss": 0.842,
      "step": 2630
    },
    {
      "epoch": 2.134303471274822,
      "grad_norm": 1.4541876796717628,
      "learning_rate": 4.046077373468325e-06,
      "loss": 0.8285,
      "step": 2640
    },
    {
      "epoch": 2.1423879541205597,
      "grad_norm": 1.515080712913025,
      "learning_rate": 3.976114543981148e-06,
      "loss": 0.8278,
      "step": 2650
    },
    {
      "epoch": 2.150472436966298,
      "grad_norm": 1.5925627792233104,
      "learning_rate": 3.906611477816054e-06,
      "loss": 0.8382,
      "step": 2660
    },
    {
      "epoch": 2.158556919812036,
      "grad_norm": 1.4749306746231339,
      "learning_rate": 3.837573479682236e-06,
      "loss": 0.8453,
      "step": 2670
    },
    {
      "epoch": 2.1666414026577736,
      "grad_norm": 1.888042329530717,
      "learning_rate": 3.769005818793329e-06,
      "loss": 0.854,
      "step": 2680
    },
    {
      "epoch": 2.174725885503512,
      "grad_norm": 1.598037794600047,
      "learning_rate": 3.7009137284652386e-06,
      "loss": 0.8519,
      "step": 2690
    },
    {
      "epoch": 2.1828103683492497,
      "grad_norm": 1.5540837615094885,
      "learning_rate": 3.633302405716712e-06,
      "loss": 0.8397,
      "step": 2700
    },
    {
      "epoch": 2.1908948511949875,
      "grad_norm": 1.430485289060877,
      "learning_rate": 3.5661770108726914e-06,
      "loss": 0.8271,
      "step": 2710
    },
    {
      "epoch": 2.1989793340407258,
      "grad_norm": 2.401835949374892,
      "learning_rate": 3.4995426671704493e-06,
      "loss": 0.8335,
      "step": 2720
    },
    {
      "epoch": 2.2070638168864636,
      "grad_norm": 1.506353292247366,
      "learning_rate": 3.433404460368587e-06,
      "loss": 0.828,
      "step": 2730
    },
    {
      "epoch": 2.2151482997322014,
      "grad_norm": 1.4406717845115946,
      "learning_rate": 3.3677674383588476e-06,
      "loss": 0.8315,
      "step": 2740
    },
    {
      "epoch": 2.2232327825779397,
      "grad_norm": 1.5393945850323205,
      "learning_rate": 3.302636610780855e-06,
      "loss": 0.8504,
      "step": 2750
    },
    {
      "epoch": 2.2313172654236775,
      "grad_norm": 1.7257558230682333,
      "learning_rate": 3.238016948639772e-06,
      "loss": 0.8232,
      "step": 2760
    },
    {
      "epoch": 2.2394017482694153,
      "grad_norm": 1.8326756661400847,
      "learning_rate": 3.1739133839268698e-06,
      "loss": 0.8154,
      "step": 2770
    },
    {
      "epoch": 2.2474862311151536,
      "grad_norm": 1.5269518503128512,
      "learning_rate": 3.110330809243134e-06,
      "loss": 0.8317,
      "step": 2780
    },
    {
      "epoch": 2.2555707139608914,
      "grad_norm": 1.504166909878008,
      "learning_rate": 3.0472740774258157e-06,
      "loss": 0.8368,
      "step": 2790
    },
    {
      "epoch": 2.263655196806629,
      "grad_norm": 1.480047137104623,
      "learning_rate": 2.9847480011780607e-06,
      "loss": 0.8409,
      "step": 2800
    },
    {
      "epoch": 2.2717396796523674,
      "grad_norm": 1.492023552078346,
      "learning_rate": 2.922757352701595e-06,
      "loss": 0.8243,
      "step": 2810
    },
    {
      "epoch": 2.2798241624981053,
      "grad_norm": 1.467055149697424,
      "learning_rate": 2.861306863332475e-06,
      "loss": 0.8289,
      "step": 2820
    },
    {
      "epoch": 2.287908645343843,
      "grad_norm": 1.504514345406056,
      "learning_rate": 2.8004012231799905e-06,
      "loss": 0.8375,
      "step": 2830
    },
    {
      "epoch": 2.295993128189581,
      "grad_norm": 1.5091792435489357,
      "learning_rate": 2.740045080768694e-06,
      "loss": 0.8233,
      "step": 2840
    },
    {
      "epoch": 2.304077611035319,
      "grad_norm": 1.4619080284602382,
      "learning_rate": 2.6802430426836113e-06,
      "loss": 0.8356,
      "step": 2850
    },
    {
      "epoch": 2.312162093881057,
      "grad_norm": 1.4085751552174153,
      "learning_rate": 2.620999673218656e-06,
      "loss": 0.8156,
      "step": 2860
    },
    {
      "epoch": 2.3202465767267952,
      "grad_norm": 1.4755258769825808,
      "learning_rate": 2.5623194940282526e-06,
      "loss": 0.8353,
      "step": 2870
    },
    {
      "epoch": 2.328331059572533,
      "grad_norm": 1.5852343601430656,
      "learning_rate": 2.504206983782248e-06,
      "loss": 0.8133,
      "step": 2880
    },
    {
      "epoch": 2.336415542418271,
      "grad_norm": 1.4903107631764194,
      "learning_rate": 2.446666577824068e-06,
      "loss": 0.8459,
      "step": 2890
    },
    {
      "epoch": 2.3445000252640087,
      "grad_norm": 1.523719484539125,
      "learning_rate": 2.389702667832202e-06,
      "loss": 0.8285,
      "step": 2900
    },
    {
      "epoch": 2.352584508109747,
      "grad_norm": 1.457321496284554,
      "learning_rate": 2.3333196014850246e-06,
      "loss": 0.8304,
      "step": 2910
    },
    {
      "epoch": 2.3606689909554848,
      "grad_norm": 1.537434676857527,
      "learning_rate": 2.277521682128947e-06,
      "loss": 0.829,
      "step": 2920
    },
    {
      "epoch": 2.3687534738012226,
      "grad_norm": 1.4707817420987006,
      "learning_rate": 2.2223131684499932e-06,
      "loss": 0.8372,
      "step": 2930
    },
    {
      "epoch": 2.376837956646961,
      "grad_norm": 1.46749047915079,
      "learning_rate": 2.1676982741487427e-06,
      "loss": 0.8222,
      "step": 2940
    },
    {
      "epoch": 2.3849224394926987,
      "grad_norm": 1.518122852634397,
      "learning_rate": 2.113681167618736e-06,
      "loss": 0.8401,
      "step": 2950
    },
    {
      "epoch": 2.3930069223384365,
      "grad_norm": 1.8575848589445734,
      "learning_rate": 2.060265971628338e-06,
      "loss": 0.8339,
      "step": 2960
    },
    {
      "epoch": 2.4010914051841747,
      "grad_norm": 1.5601145654381285,
      "learning_rate": 2.0074567630060514e-06,
      "loss": 0.8154,
      "step": 2970
    },
    {
      "epoch": 2.4091758880299126,
      "grad_norm": 1.530898387002521,
      "learning_rate": 1.955257572329379e-06,
      "loss": 0.823,
      "step": 2980
    },
    {
      "epoch": 2.4172603708756504,
      "grad_norm": 1.6224545445427798,
      "learning_rate": 1.9036723836171899e-06,
      "loss": 0.8145,
      "step": 2990
    },
    {
      "epoch": 2.4253448537213886,
      "grad_norm": 1.4013679708594033,
      "learning_rate": 1.8527051340256397e-06,
      "loss": 0.8215,
      "step": 3000
    },
    {
      "epoch": 2.4334293365671265,
      "grad_norm": 1.5692785609667004,
      "learning_rate": 1.8023597135476923e-06,
      "loss": 0.8241,
      "step": 3010
    },
    {
      "epoch": 2.4415138194128643,
      "grad_norm": 1.5126974695662643,
      "learning_rate": 1.752639964716193e-06,
      "loss": 0.8421,
      "step": 3020
    },
    {
      "epoch": 2.4495983022586025,
      "grad_norm": 1.6242742569822604,
      "learning_rate": 1.7035496823106247e-06,
      "loss": 0.8141,
      "step": 3030
    },
    {
      "epoch": 2.4576827851043404,
      "grad_norm": 1.4628790110692993,
      "learning_rate": 1.6550926130674527e-06,
      "loss": 0.8184,
      "step": 3040
    },
    {
      "epoch": 2.465767267950078,
      "grad_norm": 1.4807837431822446,
      "learning_rate": 1.607272455394172e-06,
      "loss": 0.8202,
      "step": 3050
    },
    {
      "epoch": 2.4738517507958164,
      "grad_norm": 1.5539937903441552,
      "learning_rate": 1.5600928590870402e-06,
      "loss": 0.8391,
      "step": 3060
    },
    {
      "epoch": 2.4819362336415542,
      "grad_norm": 1.6677495360703212,
      "learning_rate": 1.5135574250524898e-06,
      "loss": 0.8436,
      "step": 3070
    },
    {
      "epoch": 2.490020716487292,
      "grad_norm": 1.53769857798961,
      "learning_rate": 1.467669705032323e-06,
      "loss": 0.8263,
      "step": 3080
    },
    {
      "epoch": 2.4981051993330303,
      "grad_norm": 1.4732928239069325,
      "learning_rate": 1.422433201332607e-06,
      "loss": 0.8284,
      "step": 3090
    },
    {
      "epoch": 2.506189682178768,
      "grad_norm": 1.5928757648188723,
      "learning_rate": 1.3778513665563786e-06,
      "loss": 0.8319,
      "step": 3100
    },
    {
      "epoch": 2.514274165024506,
      "grad_norm": 1.4230928346180836,
      "learning_rate": 1.3339276033401283e-06,
      "loss": 0.8052,
      "step": 3110
    },
    {
      "epoch": 2.522358647870244,
      "grad_norm": 1.4772661299744003,
      "learning_rate": 1.290665264094093e-06,
      "loss": 0.8241,
      "step": 3120
    },
    {
      "epoch": 2.530443130715982,
      "grad_norm": 1.522091825661006,
      "learning_rate": 1.2480676507464051e-06,
      "loss": 0.8106,
      "step": 3130
    },
    {
      "epoch": 2.53852761356172,
      "grad_norm": 1.525599170654266,
      "learning_rate": 1.2061380144910572e-06,
      "loss": 0.8166,
      "step": 3140
    },
    {
      "epoch": 2.5466120964074577,
      "grad_norm": 1.4929327017491605,
      "learning_rate": 1.1648795555397719e-06,
      "loss": 0.8251,
      "step": 3150
    },
    {
      "epoch": 2.554696579253196,
      "grad_norm": 1.5920001415947864,
      "learning_rate": 1.1242954228777513e-06,
      "loss": 0.8268,
      "step": 3160
    },
    {
      "epoch": 2.5627810620989337,
      "grad_norm": 1.5252651359986042,
      "learning_rate": 1.08438871402333e-06,
      "loss": 0.831,
      "step": 3170
    },
    {
      "epoch": 2.570865544944672,
      "grad_norm": 1.6461347768103347,
      "learning_rate": 1.04516247479157e-06,
      "loss": 0.8239,
      "step": 3180
    },
    {
      "epoch": 2.57895002779041,
      "grad_norm": 1.490863354097273,
      "learning_rate": 1.006619699061785e-06,
      "loss": 0.823,
      "step": 3190
    },
    {
      "epoch": 2.5870345106361476,
      "grad_norm": 1.5158841203253022,
      "learning_rate": 9.687633285490395e-07,
      "loss": 0.8333,
      "step": 3200
    },
    {
      "epoch": 2.5951189934818855,
      "grad_norm": 1.4861408651974157,
      "learning_rate": 9.315962525796374e-07,
      "loss": 0.8178,
      "step": 3210
    },
    {
      "epoch": 2.6032034763276237,
      "grad_norm": 1.4847726389856295,
      "learning_rate": 8.951213078705811e-07,
      "loss": 0.8244,
      "step": 3220
    },
    {
      "epoch": 2.6112879591733615,
      "grad_norm": 1.4579228976188288,
      "learning_rate": 8.593412783130805e-07,
      "loss": 0.8116,
      "step": 3230
    },
    {
      "epoch": 2.6193724420191,
      "grad_norm": 1.4309284818257009,
      "learning_rate": 8.24258894760066e-07,
      "loss": 0.8233,
      "step": 3240
    },
    {
      "epoch": 2.6274569248648376,
      "grad_norm": 1.481662266621092,
      "learning_rate": 7.898768348177643e-07,
      "loss": 0.8393,
      "step": 3250
    },
    {
      "epoch": 2.6355414077105754,
      "grad_norm": 1.42582017885812,
      "learning_rate": 7.561977226413341e-07,
      "loss": 0.8344,
      "step": 3260
    },
    {
      "epoch": 2.6436258905563133,
      "grad_norm": 1.4203791210214531,
      "learning_rate": 7.23224128734582e-07,
      "loss": 0.821,
      "step": 3270
    },
    {
      "epoch": 2.6517103734020515,
      "grad_norm": 1.4780417621137758,
      "learning_rate": 6.909585697537758e-07,
      "loss": 0.8353,
      "step": 3280
    },
    {
      "epoch": 2.6597948562477893,
      "grad_norm": 1.4466612391449976,
      "learning_rate": 6.594035083155581e-07,
      "loss": 0.8268,
      "step": 3290
    },
    {
      "epoch": 2.6678793390935276,
      "grad_norm": 1.4584592752103582,
      "learning_rate": 6.285613528089962e-07,
      "loss": 0.8164,
      "step": 3300
    },
    {
      "epoch": 2.6759638219392654,
      "grad_norm": 1.487514724946772,
      "learning_rate": 5.98434457211765e-07,
      "loss": 0.8027,
      "step": 3310
    },
    {
      "epoch": 2.6840483047850032,
      "grad_norm": 1.4294666752405771,
      "learning_rate": 5.690251209104802e-07,
      "loss": 0.8105,
      "step": 3320
    },
    {
      "epoch": 2.692132787630741,
      "grad_norm": 1.4638925402226952,
      "learning_rate": 5.403355885252104e-07,
      "loss": 0.8135,
      "step": 3330
    },
    {
      "epoch": 2.7002172704764793,
      "grad_norm": 1.4458763488108235,
      "learning_rate": 5.123680497381444e-07,
      "loss": 0.8102,
      "step": 3340
    },
    {
      "epoch": 2.708301753322217,
      "grad_norm": 1.4903596037049076,
      "learning_rate": 4.851246391264819e-07,
      "loss": 0.8152,
      "step": 3350
    },
    {
      "epoch": 2.7163862361679554,
      "grad_norm": 1.4429528216246368,
      "learning_rate": 4.5860743599951186e-07,
      "loss": 0.8121,
      "step": 3360
    },
    {
      "epoch": 2.724470719013693,
      "grad_norm": 1.452035259914063,
      "learning_rate": 4.328184642399036e-07,
      "loss": 0.821,
      "step": 3370
    },
    {
      "epoch": 2.732555201859431,
      "grad_norm": 1.5303877229228735,
      "learning_rate": 4.077596921492533e-07,
      "loss": 0.8145,
      "step": 3380
    },
    {
      "epoch": 2.740639684705169,
      "grad_norm": 1.4449405328561624,
      "learning_rate": 3.834330322978397e-07,
      "loss": 0.8214,
      "step": 3390
    },
    {
      "epoch": 2.748724167550907,
      "grad_norm": 1.4371584227135465,
      "learning_rate": 3.598403413786611e-07,
      "loss": 0.8131,
      "step": 3400
    },
    {
      "epoch": 2.756808650396645,
      "grad_norm": 1.4632980675092546,
      "learning_rate": 3.3698342006572294e-07,
      "loss": 0.8244,
      "step": 3410
    },
    {
      "epoch": 2.764893133242383,
      "grad_norm": 1.4500755832832954,
      "learning_rate": 3.148640128766056e-07,
      "loss": 0.823,
      "step": 3420
    },
    {
      "epoch": 2.772977616088121,
      "grad_norm": 1.4751477866660623,
      "learning_rate": 2.934838080393154e-07,
      "loss": 0.8211,
      "step": 3430
    },
    {
      "epoch": 2.781062098933859,
      "grad_norm": 1.4653755137740456,
      "learning_rate": 2.7284443736343203e-07,
      "loss": 0.8024,
      "step": 3440
    },
    {
      "epoch": 2.7891465817795966,
      "grad_norm": 1.4089563044736344,
      "learning_rate": 2.52947476115567e-07,
      "loss": 0.8228,
      "step": 3450
    },
    {
      "epoch": 2.797231064625335,
      "grad_norm": 1.460696621649454,
      "learning_rate": 2.3379444289913344e-07,
      "loss": 0.8184,
      "step": 3460
    },
    {
      "epoch": 2.8053155474710727,
      "grad_norm": 1.4693334824298931,
      "learning_rate": 2.153867995384351e-07,
      "loss": 0.8224,
      "step": 3470
    },
    {
      "epoch": 2.8134000303168105,
      "grad_norm": 1.4469954005038157,
      "learning_rate": 1.9772595096710477e-07,
      "loss": 0.8373,
      "step": 3480
    },
    {
      "epoch": 2.821484513162549,
      "grad_norm": 1.4331150676229163,
      "learning_rate": 1.8081324512086663e-07,
      "loss": 0.8185,
      "step": 3490
    },
    {
      "epoch": 2.8295689960082866,
      "grad_norm": 1.5335384382024873,
      "learning_rate": 1.6464997283466067e-07,
      "loss": 0.8124,
      "step": 3500
    },
    {
      "epoch": 2.8376534788540244,
      "grad_norm": 1.4445147972537609,
      "learning_rate": 1.492373677441228e-07,
      "loss": 0.8145,
      "step": 3510
    },
    {
      "epoch": 2.8457379616997627,
      "grad_norm": 1.4976188260457166,
      "learning_rate": 1.3457660619142887e-07,
      "loss": 0.8163,
      "step": 3520
    },
    {
      "epoch": 2.8538224445455005,
      "grad_norm": 1.439452743377751,
      "learning_rate": 1.2066880713550888e-07,
      "loss": 0.829,
      "step": 3530
    },
    {
      "epoch": 2.8619069273912383,
      "grad_norm": 1.524984754735583,
      "learning_rate": 1.0751503206665071e-07,
      "loss": 0.8236,
      "step": 3540
    },
    {
      "epoch": 2.8699914102369766,
      "grad_norm": 1.448229914768272,
      "learning_rate": 9.511628492547609e-08,
      "loss": 0.8223,
      "step": 3550
    },
    {
      "epoch": 2.8780758930827144,
      "grad_norm": 1.4915344957228824,
      "learning_rate": 8.347351202632525e-08,
      "loss": 0.843,
      "step": 3560
    },
    {
      "epoch": 2.886160375928452,
      "grad_norm": 1.4891660841319714,
      "learning_rate": 7.258760198502246e-08,
      "loss": 0.8173,
      "step": 3570
    },
    {
      "epoch": 2.89424485877419,
      "grad_norm": 1.4485487573496472,
      "learning_rate": 6.245938565105803e-08,
      "loss": 0.8299,
      "step": 3580
    },
    {
      "epoch": 2.9023293416199283,
      "grad_norm": 1.452602418516034,
      "learning_rate": 5.308963604417572e-08,
      "loss": 0.8216,
      "step": 3590
    },
    {
      "epoch": 2.910413824465666,
      "grad_norm": 1.4554407329371093,
      "learning_rate": 4.447906829537219e-08,
      "loss": 0.8284,
      "step": 3600
    },
    {
      "epoch": 2.9184983073114044,
      "grad_norm": 1.4918607001029844,
      "learning_rate": 3.6628339592313935e-08,
      "loss": 0.8012,
      "step": 3610
    },
    {
      "epoch": 2.926582790157142,
      "grad_norm": 1.4229324193215207,
      "learning_rate": 2.95380491291819e-08,
      "loss": 0.8401,
      "step": 3620
    },
    {
      "epoch": 2.93466727300288,
      "grad_norm": 1.4288366788035922,
      "learning_rate": 2.320873806093804e-08,
      "loss": 0.8228,
      "step": 3630
    },
    {
      "epoch": 2.942751755848618,
      "grad_norm": 1.4724134547959333,
      "learning_rate": 1.764088946201947e-08,
      "loss": 0.8064,
      "step": 3640
    },
    {
      "epoch": 2.950836238694356,
      "grad_norm": 1.4984479737935563,
      "learning_rate": 1.2834928289472415e-08,
      "loss": 0.81,
      "step": 3650
    },
    {
      "epoch": 2.958920721540094,
      "grad_norm": 1.4666816312445612,
      "learning_rate": 8.79122135051591e-09,
      "loss": 0.822,
      "step": 3660
    },
    {
      "epoch": 2.967005204385832,
      "grad_norm": 1.445201429621803,
      "learning_rate": 5.510077274547554e-09,
      "loss": 0.8271,
      "step": 3670
    },
    {
      "epoch": 2.97508968723157,
      "grad_norm": 1.4460059967392547,
      "learning_rate": 2.9917464895856673e-09,
      "loss": 0.8389,
      "step": 3680
    },
    {
      "epoch": 2.983174170077308,
      "grad_norm": 1.435390156627942,
      "learning_rate": 1.2364212031579226e-09,
      "loss": 0.8294,
      "step": 3690
    },
    {
      "epoch": 2.9912586529230456,
      "grad_norm": 1.5066669703721747,
      "learning_rate": 2.442353876297432e-10,
      "loss": 0.801,
      "step": 3700
    },
    {
      "epoch": 2.997726239199636,
      "eval_loss": 0.8224219083786011,
      "eval_runtime": 474.463,
      "eval_samples_per_second": 25.637,
      "eval_steps_per_second": 12.819,
      "step": 3708
    },
    {
      "epoch": 2.997726239199636,
      "step": 3708,
      "total_flos": 0.0,
      "train_loss": 1.0273753281164324,
      "train_runtime": 58675.1239,
      "train_samples_per_second": 8.095,
      "train_steps_per_second": 0.063
    }
  ],
  "logging_steps": 10,
  "max_steps": 3708,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}