|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3169845594913716, |
|
"eval_steps": 500, |
|
"global_step": 4350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009082652134423252, |
|
"grad_norm": 2.5449585914611816, |
|
"learning_rate": 4.9848622464426284e-05, |
|
"loss": 8.7191, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018165304268846504, |
|
"grad_norm": 2.3144371509552, |
|
"learning_rate": 4.969724492885256e-05, |
|
"loss": 7.4698, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.027247956403269755, |
|
"grad_norm": 2.304499626159668, |
|
"learning_rate": 4.954586739327884e-05, |
|
"loss": 6.5589, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03633060853769301, |
|
"grad_norm": 2.272608757019043, |
|
"learning_rate": 4.9394489857705115e-05, |
|
"loss": 6.2425, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.045413260672116255, |
|
"grad_norm": 2.46329402923584, |
|
"learning_rate": 4.9243112322131396e-05, |
|
"loss": 6.1459, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05449591280653951, |
|
"grad_norm": 1.8283530473709106, |
|
"learning_rate": 4.909173478655768e-05, |
|
"loss": 5.969, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06357856494096276, |
|
"grad_norm": 2.1723110675811768, |
|
"learning_rate": 4.894035725098395e-05, |
|
"loss": 6.008, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.07266121707538602, |
|
"grad_norm": 2.5368807315826416, |
|
"learning_rate": 4.878897971541024e-05, |
|
"loss": 5.8783, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08174386920980926, |
|
"grad_norm": 2.3222858905792236, |
|
"learning_rate": 4.8637602179836515e-05, |
|
"loss": 5.825, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09082652134423251, |
|
"grad_norm": 2.557065010070801, |
|
"learning_rate": 4.8486224644262796e-05, |
|
"loss": 5.76, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09990917347865577, |
|
"grad_norm": 2.4016597270965576, |
|
"learning_rate": 4.833484710868907e-05, |
|
"loss": 5.7039, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.10899182561307902, |
|
"grad_norm": 2.6895477771759033, |
|
"learning_rate": 4.818346957311535e-05, |
|
"loss": 5.5843, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.11807447774750227, |
|
"grad_norm": 2.741234064102173, |
|
"learning_rate": 4.8032092037541634e-05, |
|
"loss": 5.6376, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1271571298819255, |
|
"grad_norm": 2.8266804218292236, |
|
"learning_rate": 4.788071450196791e-05, |
|
"loss": 5.5649, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1362397820163488, |
|
"grad_norm": 2.792654275894165, |
|
"learning_rate": 4.772933696639419e-05, |
|
"loss": 5.3651, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.14532243415077203, |
|
"grad_norm": 2.7088894844055176, |
|
"learning_rate": 4.757795943082047e-05, |
|
"loss": 5.4921, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.15440508628519528, |
|
"grad_norm": 2.627201795578003, |
|
"learning_rate": 4.7426581895246746e-05, |
|
"loss": 5.461, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.16348773841961853, |
|
"grad_norm": 2.6373610496520996, |
|
"learning_rate": 4.727520435967303e-05, |
|
"loss": 5.3973, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.17257039055404177, |
|
"grad_norm": 2.772226095199585, |
|
"learning_rate": 4.71238268240993e-05, |
|
"loss": 5.3618, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.18165304268846502, |
|
"grad_norm": 2.6005172729492188, |
|
"learning_rate": 4.6972449288525583e-05, |
|
"loss": 5.4365, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1907356948228883, |
|
"grad_norm": 4.7815260887146, |
|
"learning_rate": 4.6821071752951865e-05, |
|
"loss": 5.3225, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.19981834695731154, |
|
"grad_norm": 2.5871763229370117, |
|
"learning_rate": 4.6669694217378146e-05, |
|
"loss": 5.3615, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2089009990917348, |
|
"grad_norm": 2.686840534210205, |
|
"learning_rate": 4.651831668180443e-05, |
|
"loss": 5.3201, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.21798365122615804, |
|
"grad_norm": 2.6963067054748535, |
|
"learning_rate": 4.63669391462307e-05, |
|
"loss": 5.1972, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.22706630336058128, |
|
"grad_norm": 2.9284744262695312, |
|
"learning_rate": 4.6215561610656984e-05, |
|
"loss": 5.3031, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.23614895549500453, |
|
"grad_norm": 2.7302122116088867, |
|
"learning_rate": 4.606418407508326e-05, |
|
"loss": 5.2057, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2452316076294278, |
|
"grad_norm": 2.5760107040405273, |
|
"learning_rate": 4.591280653950954e-05, |
|
"loss": 5.1767, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.254314259763851, |
|
"grad_norm": 2.9804234504699707, |
|
"learning_rate": 4.576142900393582e-05, |
|
"loss": 5.1875, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2633969118982743, |
|
"grad_norm": 3.311448812484741, |
|
"learning_rate": 4.5610051468362096e-05, |
|
"loss": 5.0712, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2724795640326976, |
|
"grad_norm": 2.67448091506958, |
|
"learning_rate": 4.545867393278838e-05, |
|
"loss": 5.1241, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2815622161671208, |
|
"grad_norm": 2.8352444171905518, |
|
"learning_rate": 4.530729639721465e-05, |
|
"loss": 5.1732, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.29064486830154407, |
|
"grad_norm": 2.5969910621643066, |
|
"learning_rate": 4.515591886164093e-05, |
|
"loss": 5.0828, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2997275204359673, |
|
"grad_norm": 2.8792121410369873, |
|
"learning_rate": 4.5004541326067215e-05, |
|
"loss": 5.0844, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.30881017257039056, |
|
"grad_norm": 2.9506993293762207, |
|
"learning_rate": 4.485316379049349e-05, |
|
"loss": 5.1764, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.3178928247048138, |
|
"grad_norm": 2.8818390369415283, |
|
"learning_rate": 4.470178625491977e-05, |
|
"loss": 5.0663, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.32697547683923706, |
|
"grad_norm": 3.128511667251587, |
|
"learning_rate": 4.4550408719346046e-05, |
|
"loss": 5.1026, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.33605812897366033, |
|
"grad_norm": 3.0155856609344482, |
|
"learning_rate": 4.4399031183772334e-05, |
|
"loss": 5.0686, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.34514078110808355, |
|
"grad_norm": 2.811448097229004, |
|
"learning_rate": 4.424765364819861e-05, |
|
"loss": 5.0351, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3542234332425068, |
|
"grad_norm": 2.9916000366210938, |
|
"learning_rate": 4.409627611262489e-05, |
|
"loss": 5.1651, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.36330608537693004, |
|
"grad_norm": 2.9689950942993164, |
|
"learning_rate": 4.394489857705117e-05, |
|
"loss": 5.1457, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3723887375113533, |
|
"grad_norm": 2.7896862030029297, |
|
"learning_rate": 4.3793521041477446e-05, |
|
"loss": 5.0049, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3814713896457766, |
|
"grad_norm": 2.790712833404541, |
|
"learning_rate": 4.364214350590373e-05, |
|
"loss": 4.9943, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3905540417801998, |
|
"grad_norm": 2.9977900981903076, |
|
"learning_rate": 4.349076597033e-05, |
|
"loss": 4.996, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3996366939146231, |
|
"grad_norm": 3.504183530807495, |
|
"learning_rate": 4.333938843475628e-05, |
|
"loss": 4.9611, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.4087193460490463, |
|
"grad_norm": 2.737821578979492, |
|
"learning_rate": 4.3188010899182565e-05, |
|
"loss": 4.9541, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4178019981834696, |
|
"grad_norm": 3.0585217475891113, |
|
"learning_rate": 4.303663336360884e-05, |
|
"loss": 4.9014, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4268846503178928, |
|
"grad_norm": 3.004413604736328, |
|
"learning_rate": 4.288525582803512e-05, |
|
"loss": 4.9703, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4359673024523161, |
|
"grad_norm": 2.9328274726867676, |
|
"learning_rate": 4.27338782924614e-05, |
|
"loss": 4.9637, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.44504995458673935, |
|
"grad_norm": 2.93721604347229, |
|
"learning_rate": 4.258250075688768e-05, |
|
"loss": 4.8024, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.45413260672116257, |
|
"grad_norm": 3.0333001613616943, |
|
"learning_rate": 4.243112322131396e-05, |
|
"loss": 4.8555, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.46321525885558584, |
|
"grad_norm": 3.3445775508880615, |
|
"learning_rate": 4.227974568574024e-05, |
|
"loss": 4.8035, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.47229791099000906, |
|
"grad_norm": 2.9364359378814697, |
|
"learning_rate": 4.212836815016652e-05, |
|
"loss": 4.9296, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.48138056312443234, |
|
"grad_norm": 2.755453586578369, |
|
"learning_rate": 4.1976990614592796e-05, |
|
"loss": 4.8051, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4904632152588556, |
|
"grad_norm": 3.0365066528320312, |
|
"learning_rate": 4.182561307901908e-05, |
|
"loss": 4.7833, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.49954586739327883, |
|
"grad_norm": 3.2632575035095215, |
|
"learning_rate": 4.167423554344536e-05, |
|
"loss": 4.837, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.508628519527702, |
|
"grad_norm": 3.310817003250122, |
|
"learning_rate": 4.152285800787163e-05, |
|
"loss": 4.7417, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5177111716621253, |
|
"grad_norm": 3.121156692504883, |
|
"learning_rate": 4.1371480472297915e-05, |
|
"loss": 4.7791, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5267938237965486, |
|
"grad_norm": 3.200591564178467, |
|
"learning_rate": 4.122010293672419e-05, |
|
"loss": 4.8619, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5358764759309719, |
|
"grad_norm": 3.1420202255249023, |
|
"learning_rate": 4.106872540115047e-05, |
|
"loss": 4.7576, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5449591280653951, |
|
"grad_norm": 3.2239160537719727, |
|
"learning_rate": 4.091734786557675e-05, |
|
"loss": 4.7767, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5540417801998183, |
|
"grad_norm": 2.9624414443969727, |
|
"learning_rate": 4.076597033000303e-05, |
|
"loss": 4.8608, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5631244323342416, |
|
"grad_norm": 3.14367938041687, |
|
"learning_rate": 4.061459279442931e-05, |
|
"loss": 4.7909, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5722070844686649, |
|
"grad_norm": 3.664564371109009, |
|
"learning_rate": 4.046321525885558e-05, |
|
"loss": 4.7325, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5812897366030881, |
|
"grad_norm": 2.9251296520233154, |
|
"learning_rate": 4.0311837723281864e-05, |
|
"loss": 4.8017, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5903723887375113, |
|
"grad_norm": 2.8796215057373047, |
|
"learning_rate": 4.0160460187708146e-05, |
|
"loss": 4.7124, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5994550408719346, |
|
"grad_norm": 3.0257513523101807, |
|
"learning_rate": 4.000908265213443e-05, |
|
"loss": 4.7311, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.6085376930063578, |
|
"grad_norm": 3.096799612045288, |
|
"learning_rate": 3.985770511656071e-05, |
|
"loss": 4.6568, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.6176203451407811, |
|
"grad_norm": 3.1430232524871826, |
|
"learning_rate": 3.970632758098698e-05, |
|
"loss": 4.6451, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.6267029972752044, |
|
"grad_norm": 3.0216684341430664, |
|
"learning_rate": 3.9554950045413265e-05, |
|
"loss": 4.6565, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.6357856494096276, |
|
"grad_norm": 3.0199525356292725, |
|
"learning_rate": 3.940357250983954e-05, |
|
"loss": 4.6988, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6448683015440508, |
|
"grad_norm": 2.9998953342437744, |
|
"learning_rate": 3.925219497426582e-05, |
|
"loss": 4.6654, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6539509536784741, |
|
"grad_norm": 3.15533447265625, |
|
"learning_rate": 3.91008174386921e-05, |
|
"loss": 4.616, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6630336058128974, |
|
"grad_norm": 2.8745930194854736, |
|
"learning_rate": 3.894943990311838e-05, |
|
"loss": 4.649, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6721162579473207, |
|
"grad_norm": 3.0759665966033936, |
|
"learning_rate": 3.879806236754466e-05, |
|
"loss": 4.6054, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6811989100817438, |
|
"grad_norm": 3.0508482456207275, |
|
"learning_rate": 3.864668483197093e-05, |
|
"loss": 4.4922, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6902815622161671, |
|
"grad_norm": 2.9260127544403076, |
|
"learning_rate": 3.8495307296397214e-05, |
|
"loss": 4.6469, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6993642143505904, |
|
"grad_norm": 2.924952268600464, |
|
"learning_rate": 3.8343929760823496e-05, |
|
"loss": 4.6164, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.7084468664850136, |
|
"grad_norm": 3.056288480758667, |
|
"learning_rate": 3.819255222524977e-05, |
|
"loss": 4.5877, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.7175295186194369, |
|
"grad_norm": 4.257227420806885, |
|
"learning_rate": 3.804117468967605e-05, |
|
"loss": 4.6301, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.7266121707538601, |
|
"grad_norm": 3.282137155532837, |
|
"learning_rate": 3.788979715410233e-05, |
|
"loss": 4.4623, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7356948228882834, |
|
"grad_norm": 2.945059299468994, |
|
"learning_rate": 3.7738419618528615e-05, |
|
"loss": 4.6267, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7447774750227066, |
|
"grad_norm": 3.1374645233154297, |
|
"learning_rate": 3.7587042082954896e-05, |
|
"loss": 4.6835, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7538601271571299, |
|
"grad_norm": 3.21016001701355, |
|
"learning_rate": 3.743566454738117e-05, |
|
"loss": 4.5581, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.7629427792915532, |
|
"grad_norm": 2.8072383403778076, |
|
"learning_rate": 3.728428701180745e-05, |
|
"loss": 4.571, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7720254314259763, |
|
"grad_norm": 2.9735002517700195, |
|
"learning_rate": 3.713290947623373e-05, |
|
"loss": 4.5013, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7811080835603996, |
|
"grad_norm": 3.182706832885742, |
|
"learning_rate": 3.698153194066001e-05, |
|
"loss": 4.534, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7901907356948229, |
|
"grad_norm": 2.958193778991699, |
|
"learning_rate": 3.683015440508629e-05, |
|
"loss": 4.5697, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7992733878292462, |
|
"grad_norm": 2.950946569442749, |
|
"learning_rate": 3.6678776869512564e-05, |
|
"loss": 4.6066, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.8083560399636693, |
|
"grad_norm": 2.9701859951019287, |
|
"learning_rate": 3.6527399333938846e-05, |
|
"loss": 4.5934, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.8174386920980926, |
|
"grad_norm": 3.2177681922912598, |
|
"learning_rate": 3.637602179836512e-05, |
|
"loss": 4.5418, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8265213442325159, |
|
"grad_norm": 2.7435505390167236, |
|
"learning_rate": 3.62246442627914e-05, |
|
"loss": 4.5485, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.8356039963669392, |
|
"grad_norm": 3.4409849643707275, |
|
"learning_rate": 3.607326672721768e-05, |
|
"loss": 4.4268, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.8446866485013624, |
|
"grad_norm": 3.803256034851074, |
|
"learning_rate": 3.592188919164396e-05, |
|
"loss": 4.5643, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8537693006357856, |
|
"grad_norm": 3.0399341583251953, |
|
"learning_rate": 3.5770511656070246e-05, |
|
"loss": 4.4783, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8628519527702089, |
|
"grad_norm": 2.9948980808258057, |
|
"learning_rate": 3.561913412049652e-05, |
|
"loss": 4.4929, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8719346049046321, |
|
"grad_norm": 3.400299549102783, |
|
"learning_rate": 3.54677565849228e-05, |
|
"loss": 4.4803, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8810172570390554, |
|
"grad_norm": 2.9282257556915283, |
|
"learning_rate": 3.531637904934908e-05, |
|
"loss": 4.4554, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8900999091734787, |
|
"grad_norm": 2.957598924636841, |
|
"learning_rate": 3.516500151377536e-05, |
|
"loss": 4.5324, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8991825613079019, |
|
"grad_norm": 2.9992153644561768, |
|
"learning_rate": 3.501362397820164e-05, |
|
"loss": 4.508, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.9082652134423251, |
|
"grad_norm": 3.1509618759155273, |
|
"learning_rate": 3.4862246442627914e-05, |
|
"loss": 4.4265, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9173478655767484, |
|
"grad_norm": 3.027726888656616, |
|
"learning_rate": 3.4710868907054196e-05, |
|
"loss": 4.4979, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.9264305177111717, |
|
"grad_norm": 3.0711803436279297, |
|
"learning_rate": 3.455949137148047e-05, |
|
"loss": 4.4946, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.935513169845595, |
|
"grad_norm": 2.982269287109375, |
|
"learning_rate": 3.440811383590675e-05, |
|
"loss": 4.3433, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.9445958219800181, |
|
"grad_norm": 2.9734480381011963, |
|
"learning_rate": 3.425673630033303e-05, |
|
"loss": 4.453, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.9536784741144414, |
|
"grad_norm": 2.985030174255371, |
|
"learning_rate": 3.410535876475931e-05, |
|
"loss": 4.3705, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9627611262488647, |
|
"grad_norm": 3.1812829971313477, |
|
"learning_rate": 3.395398122918559e-05, |
|
"loss": 4.3414, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.971843778383288, |
|
"grad_norm": 3.415923595428467, |
|
"learning_rate": 3.380260369361187e-05, |
|
"loss": 4.522, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.9809264305177112, |
|
"grad_norm": 3.176737070083618, |
|
"learning_rate": 3.3651226158038145e-05, |
|
"loss": 4.4112, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.9900090826521344, |
|
"grad_norm": 3.1306254863739014, |
|
"learning_rate": 3.3499848622464433e-05, |
|
"loss": 4.5104, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.9990917347865577, |
|
"grad_norm": 3.216395616531372, |
|
"learning_rate": 3.334847108689071e-05, |
|
"loss": 4.3244, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.008174386920981, |
|
"grad_norm": 3.1889307498931885, |
|
"learning_rate": 3.319709355131699e-05, |
|
"loss": 4.3521, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.017257039055404, |
|
"grad_norm": 2.8001787662506104, |
|
"learning_rate": 3.3045716015743264e-05, |
|
"loss": 4.3047, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.0263396911898275, |
|
"grad_norm": 3.5796685218811035, |
|
"learning_rate": 3.2894338480169546e-05, |
|
"loss": 4.1921, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.0354223433242506, |
|
"grad_norm": 3.725538730621338, |
|
"learning_rate": 3.274296094459583e-05, |
|
"loss": 4.3203, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.044504995458674, |
|
"grad_norm": 2.9058167934417725, |
|
"learning_rate": 3.25915834090221e-05, |
|
"loss": 4.385, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.0535876475930972, |
|
"grad_norm": 3.120119333267212, |
|
"learning_rate": 3.244020587344838e-05, |
|
"loss": 4.2883, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.0626702997275204, |
|
"grad_norm": 3.230036735534668, |
|
"learning_rate": 3.228882833787466e-05, |
|
"loss": 4.3602, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.0717529518619437, |
|
"grad_norm": 3.482921600341797, |
|
"learning_rate": 3.213745080230094e-05, |
|
"loss": 4.3984, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.080835603996367, |
|
"grad_norm": 3.0121572017669678, |
|
"learning_rate": 3.198607326672722e-05, |
|
"loss": 4.3864, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.0899182561307903, |
|
"grad_norm": 3.277411460876465, |
|
"learning_rate": 3.1834695731153495e-05, |
|
"loss": 4.2294, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.0990009082652135, |
|
"grad_norm": 3.0383167266845703, |
|
"learning_rate": 3.168331819557978e-05, |
|
"loss": 4.2759, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.1080835603996366, |
|
"grad_norm": 3.3026745319366455, |
|
"learning_rate": 3.153194066000605e-05, |
|
"loss": 4.3093, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.11716621253406, |
|
"grad_norm": 2.954747200012207, |
|
"learning_rate": 3.138056312443234e-05, |
|
"loss": 4.2476, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.1262488646684832, |
|
"grad_norm": 3.2137765884399414, |
|
"learning_rate": 3.1229185588858614e-05, |
|
"loss": 4.2858, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.1353315168029066, |
|
"grad_norm": 3.4028799533843994, |
|
"learning_rate": 3.1077808053284896e-05, |
|
"loss": 4.3652, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.1444141689373297, |
|
"grad_norm": 3.0039563179016113, |
|
"learning_rate": 3.092643051771118e-05, |
|
"loss": 4.4106, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.1534968210717529, |
|
"grad_norm": 2.973820209503174, |
|
"learning_rate": 3.077505298213745e-05, |
|
"loss": 4.1827, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.1625794732061763, |
|
"grad_norm": 2.99037766456604, |
|
"learning_rate": 3.062367544656373e-05, |
|
"loss": 4.3092, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.1716621253405994, |
|
"grad_norm": 3.181398391723633, |
|
"learning_rate": 3.047229791099001e-05, |
|
"loss": 4.417, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.1807447774750228, |
|
"grad_norm": 3.1933484077453613, |
|
"learning_rate": 3.032092037541629e-05, |
|
"loss": 4.2361, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.189827429609446, |
|
"grad_norm": 3.4427855014801025, |
|
"learning_rate": 3.0169542839842567e-05, |
|
"loss": 4.2687, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.1989100817438691, |
|
"grad_norm": 3.0683298110961914, |
|
"learning_rate": 3.001816530426885e-05, |
|
"loss": 4.2748, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.2079927338782925, |
|
"grad_norm": 3.044698715209961, |
|
"learning_rate": 2.9866787768695127e-05, |
|
"loss": 4.2671, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.2170753860127157, |
|
"grad_norm": 3.1354904174804688, |
|
"learning_rate": 2.9715410233121405e-05, |
|
"loss": 4.2635, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.226158038147139, |
|
"grad_norm": 3.282745361328125, |
|
"learning_rate": 2.9564032697547683e-05, |
|
"loss": 4.3544, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.2352406902815622, |
|
"grad_norm": 3.369798183441162, |
|
"learning_rate": 2.941265516197396e-05, |
|
"loss": 4.1993, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.2443233424159854, |
|
"grad_norm": 3.395785331726074, |
|
"learning_rate": 2.9261277626400242e-05, |
|
"loss": 4.1131, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.2534059945504088, |
|
"grad_norm": 3.500697135925293, |
|
"learning_rate": 2.9109900090826524e-05, |
|
"loss": 4.192, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.262488646684832, |
|
"grad_norm": 2.94278621673584, |
|
"learning_rate": 2.8958522555252805e-05, |
|
"loss": 4.2863, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.2715712988192553, |
|
"grad_norm": 3.3217315673828125, |
|
"learning_rate": 2.8807145019679083e-05, |
|
"loss": 4.1763, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2806539509536785, |
|
"grad_norm": 3.232830762863159, |
|
"learning_rate": 2.865576748410536e-05, |
|
"loss": 4.2595, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.2897366030881017, |
|
"grad_norm": 3.3042378425598145, |
|
"learning_rate": 2.850438994853164e-05, |
|
"loss": 4.2393, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.298819255222525, |
|
"grad_norm": 3.83151912689209, |
|
"learning_rate": 2.835301241295792e-05, |
|
"loss": 4.3005, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.3079019073569482, |
|
"grad_norm": 3.245086431503296, |
|
"learning_rate": 2.82016348773842e-05, |
|
"loss": 4.205, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.3169845594913716, |
|
"grad_norm": 3.4392285346984863, |
|
"learning_rate": 2.8050257341810477e-05, |
|
"loss": 4.1964, |
|
"step": 4350 |
|
} |
|
], |
|
"logging_steps": 30, |
|
"max_steps": 9909, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 30, |
|
"total_flos": 1136555016192000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|