{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9981634527089072,
  "eval_steps": 500,
  "global_step": 1088,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018365472910927456,
      "grad_norm": 0.27851340305035927,
      "learning_rate": 9.997915740506687e-05,
      "loss": 1.0445,
      "step": 10
    },
    {
      "epoch": 0.03673094582185491,
      "grad_norm": 0.061819145479423056,
      "learning_rate": 9.991664699681799e-05,
      "loss": 0.5539,
      "step": 20
    },
    {
      "epoch": 0.05509641873278237,
      "grad_norm": 0.03760363481720962,
      "learning_rate": 9.981252089041809e-05,
      "loss": 0.4787,
      "step": 30
    },
    {
      "epoch": 0.07346189164370982,
      "grad_norm": 0.0313950706656064,
      "learning_rate": 9.966686589619751e-05,
      "loss": 0.468,
      "step": 40
    },
    {
      "epoch": 0.09182736455463728,
      "grad_norm": 0.022877630980033918,
      "learning_rate": 9.947980344727799e-05,
      "loss": 0.4451,
      "step": 50
    },
    {
      "epoch": 0.11019283746556474,
      "grad_norm": 0.029933452784706518,
      "learning_rate": 9.925148949833355e-05,
      "loss": 0.4469,
      "step": 60
    },
    {
      "epoch": 0.1285583103764922,
      "grad_norm": 0.023764995733481963,
      "learning_rate": 9.898211439557042e-05,
      "loss": 0.4424,
      "step": 70
    },
    {
      "epoch": 0.14692378328741965,
      "grad_norm": 0.02327001477570512,
      "learning_rate": 9.867190271803465e-05,
      "loss": 0.4336,
      "step": 80
    },
    {
      "epoch": 0.1652892561983471,
      "grad_norm": 0.024539265266611217,
      "learning_rate": 9.832111309037979e-05,
      "loss": 0.4321,
      "step": 90
    },
    {
      "epoch": 0.18365472910927455,
      "grad_norm": 0.039657236107179715,
      "learning_rate": 9.793003796725048e-05,
      "loss": 0.4384,
      "step": 100
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 0.027774934175057744,
      "learning_rate": 9.749900338946192e-05,
      "loss": 0.4283,
      "step": 110
    },
    {
      "epoch": 0.22038567493112948,
      "grad_norm": 0.02480612425885802,
      "learning_rate": 9.702836871217839e-05,
      "loss": 0.4327,
      "step": 120
    },
    {
      "epoch": 0.23875114784205692,
      "grad_norm": 0.025217057933479833,
      "learning_rate": 9.651852630531748e-05,
      "loss": 0.4191,
      "step": 130
    },
    {
      "epoch": 0.2571166207529844,
      "grad_norm": 0.042028378260899814,
      "learning_rate": 9.596990122642983e-05,
      "loss": 0.4171,
      "step": 140
    },
    {
      "epoch": 0.27548209366391185,
      "grad_norm": 0.030731629233234352,
      "learning_rate": 9.538295086632703e-05,
      "loss": 0.4273,
      "step": 150
    },
    {
      "epoch": 0.2938475665748393,
      "grad_norm": 0.06006724597666194,
      "learning_rate": 9.475816456775313e-05,
      "loss": 0.4266,
      "step": 160
    },
    {
      "epoch": 0.3122130394857668,
      "grad_norm": 0.02893865752550571,
      "learning_rate": 9.409606321741775e-05,
      "loss": 0.4196,
      "step": 170
    },
    {
      "epoch": 0.3305785123966942,
      "grad_norm": 0.026458687435744248,
      "learning_rate": 9.339719881173093e-05,
      "loss": 0.423,
      "step": 180
    },
    {
      "epoch": 0.34894398530762166,
      "grad_norm": 0.03576661132708211,
      "learning_rate": 9.266215399660146e-05,
      "loss": 0.4192,
      "step": 190
    },
    {
      "epoch": 0.3673094582185491,
      "grad_norm": 0.02652249141257209,
      "learning_rate": 9.189154158168292e-05,
      "loss": 0.4164,
      "step": 200
    },
    {
      "epoch": 0.3856749311294766,
      "grad_norm": 0.025216448544065753,
      "learning_rate": 9.108600402947192e-05,
      "loss": 0.4272,
      "step": 210
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 0.026533959286346408,
      "learning_rate": 9.02462129196846e-05,
      "loss": 0.4189,
      "step": 220
    },
    {
      "epoch": 0.42240587695133147,
      "grad_norm": 0.026029222217085504,
      "learning_rate": 8.93728683893582e-05,
      "loss": 0.4207,
      "step": 230
    },
    {
      "epoch": 0.44077134986225897,
      "grad_norm": 0.028412018887179133,
      "learning_rate": 8.846669854914396e-05,
      "loss": 0.417,
      "step": 240
    },
    {
      "epoch": 0.4591368227731864,
      "grad_norm": 0.026783824853837783,
      "learning_rate": 8.752845887627872e-05,
      "loss": 0.4147,
      "step": 250
    },
    {
      "epoch": 0.47750229568411384,
      "grad_norm": 0.026947377894842927,
      "learning_rate": 8.655893158474055e-05,
      "loss": 0.4179,
      "step": 260
    },
    {
      "epoch": 0.49586776859504134,
      "grad_norm": 0.03613738404815021,
      "learning_rate": 8.555892497311402e-05,
      "loss": 0.4068,
      "step": 270
    },
    {
      "epoch": 0.5142332415059688,
      "grad_norm": 0.027745465710559532,
      "learning_rate": 8.452927275070858e-05,
      "loss": 0.4103,
      "step": 280
    },
    {
      "epoch": 0.5325987144168962,
      "grad_norm": 0.025444027330206734,
      "learning_rate": 8.347083334249199e-05,
      "loss": 0.4009,
      "step": 290
    },
    {
      "epoch": 0.5509641873278237,
      "grad_norm": 0.029718700126100683,
      "learning_rate": 8.23844891734181e-05,
      "loss": 0.4183,
      "step": 300
    },
    {
      "epoch": 0.5693296602387512,
      "grad_norm": 0.029941432231859456,
      "learning_rate": 8.12711459327459e-05,
      "loss": 0.4199,
      "step": 310
    },
    {
      "epoch": 0.5876951331496786,
      "grad_norm": 0.02728726308576057,
      "learning_rate": 8.013173181896283e-05,
      "loss": 0.4023,
      "step": 320
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.02842461149504098,
      "learning_rate": 7.89671967659423e-05,
      "loss": 0.4185,
      "step": 330
    },
    {
      "epoch": 0.6244260789715336,
      "grad_norm": 0.02557752553781187,
      "learning_rate": 7.777851165098012e-05,
      "loss": 0.4259,
      "step": 340
    },
    {
      "epoch": 0.642791551882461,
      "grad_norm": 0.026972728188092137,
      "learning_rate": 7.656666748537045e-05,
      "loss": 0.4039,
      "step": 350
    },
    {
      "epoch": 0.6611570247933884,
      "grad_norm": 0.024371307606132524,
      "learning_rate": 7.533267458819598e-05,
      "loss": 0.4216,
      "step": 360
    },
    {
      "epoch": 0.6795224977043158,
      "grad_norm": 0.031096864607523835,
      "learning_rate": 7.407756174402089e-05,
      "loss": 0.4112,
      "step": 370
    },
    {
      "epoch": 0.6978879706152433,
      "grad_norm": 0.027930614232647415,
      "learning_rate": 7.280237534518947e-05,
      "loss": 0.4154,
      "step": 380
    },
    {
      "epoch": 0.7162534435261708,
      "grad_norm": 0.02788401233462263,
      "learning_rate": 7.150817851944472e-05,
      "loss": 0.4079,
      "step": 390
    },
    {
      "epoch": 0.7346189164370982,
      "grad_norm": 0.030760734784571572,
      "learning_rate": 7.019605024359474e-05,
      "loss": 0.411,
      "step": 400
    },
    {
      "epoch": 0.7529843893480257,
      "grad_norm": 0.025453805261318345,
      "learning_rate": 6.886708444396573e-05,
      "loss": 0.4065,
      "step": 410
    },
    {
      "epoch": 0.7713498622589532,
      "grad_norm": 0.028933030043804563,
      "learning_rate": 6.75223890843913e-05,
      "loss": 0.4171,
      "step": 420
    },
    {
      "epoch": 0.7897153351698806,
      "grad_norm": 0.026727439486227408,
      "learning_rate": 6.6163085242499e-05,
      "loss": 0.408,
      "step": 430
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.028829235828263126,
      "learning_rate": 6.479030617506353e-05,
      "loss": 0.4148,
      "step": 440
    },
    {
      "epoch": 0.8264462809917356,
      "grad_norm": 0.02837397571927983,
      "learning_rate": 6.34051963732063e-05,
      "loss": 0.4142,
      "step": 450
    },
    {
      "epoch": 0.8448117539026629,
      "grad_norm": 0.030025509075918014,
      "learning_rate": 6.200891060822883e-05,
      "loss": 0.4248,
      "step": 460
    },
    {
      "epoch": 0.8631772268135904,
      "grad_norm": 0.02918663842326674,
      "learning_rate": 6.060261296887554e-05,
      "loss": 0.4106,
      "step": 470
    },
    {
      "epoch": 0.8815426997245179,
      "grad_norm": 0.030657261653125826,
      "learning_rate": 5.918747589082853e-05,
      "loss": 0.4118,
      "step": 480
    },
    {
      "epoch": 0.8999081726354453,
      "grad_norm": 0.029002395463699617,
      "learning_rate": 5.776467917924348e-05,
      "loss": 0.409,
      "step": 490
    },
    {
      "epoch": 0.9182736455463728,
      "grad_norm": 0.02963746695292411,
      "learning_rate": 5.6335409025141694e-05,
      "loss": 0.406,
      "step": 500
    },
    {
      "epoch": 0.9366391184573003,
      "grad_norm": 0.03089177834981146,
      "learning_rate": 5.490085701647805e-05,
      "loss": 0.4102,
      "step": 510
    },
    {
      "epoch": 0.9550045913682277,
      "grad_norm": 0.032695907122492804,
      "learning_rate": 5.346221914470959e-05,
      "loss": 0.4025,
      "step": 520
    },
    {
      "epoch": 0.9733700642791552,
      "grad_norm": 0.0283784844379621,
      "learning_rate": 5.2020694807693015e-05,
      "loss": 0.401,
      "step": 530
    },
    {
      "epoch": 0.9917355371900827,
      "grad_norm": 0.02859989539113277,
      "learning_rate": 5.0577485809742044e-05,
      "loss": 0.4163,
      "step": 540
    },
    {
      "epoch": 1.0101010101010102,
      "grad_norm": 0.03103627192686722,
      "learning_rate": 4.913379535967859e-05,
      "loss": 0.4039,
      "step": 550
    },
    {
      "epoch": 1.0284664830119377,
      "grad_norm": 0.03322012900664279,
      "learning_rate": 4.7690827067713035e-05,
      "loss": 0.3914,
      "step": 560
    },
    {
      "epoch": 1.046831955922865,
      "grad_norm": 0.03642666862930741,
      "learning_rate": 4.6249783941989785e-05,
      "loss": 0.3908,
      "step": 570
    },
    {
      "epoch": 1.0651974288337924,
      "grad_norm": 0.0336389842141535,
      "learning_rate": 4.481186738563492e-05,
      "loss": 0.3916,
      "step": 580
    },
    {
      "epoch": 1.08356290174472,
      "grad_norm": 0.03449706815272795,
      "learning_rate": 4.3378276195141665e-05,
      "loss": 0.3858,
      "step": 590
    },
    {
      "epoch": 1.1019283746556474,
      "grad_norm": 0.036410754456203426,
      "learning_rate": 4.195020556092935e-05,
      "loss": 0.3837,
      "step": 600
    },
    {
      "epoch": 1.120293847566575,
      "grad_norm": 0.03485127798044261,
      "learning_rate": 4.05288460709086e-05,
      "loss": 0.3904,
      "step": 610
    },
    {
      "epoch": 1.1386593204775024,
      "grad_norm": 0.038047760695129655,
      "learning_rate": 3.911538271788358e-05,
      "loss": 0.3964,
      "step": 620
    },
    {
      "epoch": 1.1570247933884297,
      "grad_norm": 0.038412283865629844,
      "learning_rate": 3.7710993911619094e-05,
      "loss": 0.3774,
      "step": 630
    },
    {
      "epoch": 1.1753902662993572,
      "grad_norm": 0.036371859571502144,
      "learning_rate": 3.631685049639586e-05,
      "loss": 0.3907,
      "step": 640
    },
    {
      "epoch": 1.1937557392102847,
      "grad_norm": 0.03976682097907334,
      "learning_rate": 3.493411477487315e-05,
      "loss": 0.3858,
      "step": 650
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 0.037005097607228386,
      "learning_rate": 3.3563939539072705e-05,
      "loss": 0.3847,
      "step": 660
    },
    {
      "epoch": 1.2304866850321396,
      "grad_norm": 0.03996197909827653,
      "learning_rate": 3.2207467109291586e-05,
      "loss": 0.3979,
      "step": 670
    },
    {
      "epoch": 1.248852157943067,
      "grad_norm": 0.04158286829796218,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.39,
      "step": 680
    },
    {
      "epoch": 1.2672176308539944,
      "grad_norm": 0.04139047858760264,
      "learning_rate": 2.9540141885736262e-05,
      "loss": 0.3874,
      "step": 690
    },
    {
      "epoch": 1.285583103764922,
      "grad_norm": 0.041594762089601535,
      "learning_rate": 2.8231512851129593e-05,
      "loss": 0.3934,
      "step": 700
    },
    {
      "epoch": 1.3039485766758494,
      "grad_norm": 0.0431336076994283,
      "learning_rate": 2.6941032286920985e-05,
      "loss": 0.3775,
      "step": 710
    },
    {
      "epoch": 1.322314049586777,
      "grad_norm": 0.04035918987424071,
      "learning_rate": 2.5669776071657192e-05,
      "loss": 0.4017,
      "step": 720
    },
    {
      "epoch": 1.3406795224977044,
      "grad_norm": 0.0417444272173153,
      "learning_rate": 2.4418804056472227e-05,
      "loss": 0.3799,
      "step": 730
    },
    {
      "epoch": 1.3590449954086319,
      "grad_norm": 0.04538570278878064,
      "learning_rate": 2.3189159181485516e-05,
      "loss": 0.3921,
      "step": 740
    },
    {
      "epoch": 1.3774104683195592,
      "grad_norm": 0.043013131185681716,
      "learning_rate": 2.1981866606298683e-05,
      "loss": 0.3857,
      "step": 750
    },
    {
      "epoch": 1.3957759412304866,
      "grad_norm": 0.043552361092639345,
      "learning_rate": 2.079793285531618e-05,
      "loss": 0.3937,
      "step": 760
    },
    {
      "epoch": 1.4141414141414141,
      "grad_norm": 0.041014272519928036,
      "learning_rate": 1.963834497860192e-05,
      "loss": 0.3801,
      "step": 770
    },
    {
      "epoch": 1.4325068870523416,
      "grad_norm": 0.03865452597618009,
      "learning_rate": 1.8504069728972123e-05,
      "loss": 0.3834,
      "step": 780
    },
    {
      "epoch": 1.4508723599632691,
      "grad_norm": 0.04173720509030274,
      "learning_rate": 1.7396052756009574e-05,
      "loss": 0.3829,
      "step": 790
    },
    {
      "epoch": 1.4692378328741964,
      "grad_norm": 0.04256905712895654,
      "learning_rate": 1.631521781767214e-05,
      "loss": 0.3795,
      "step": 800
    },
    {
      "epoch": 1.487603305785124,
      "grad_norm": 0.0473389022063686,
      "learning_rate": 1.52624660101522e-05,
      "loss": 0.3801,
      "step": 810
    },
    {
      "epoch": 1.5059687786960514,
      "grad_norm": 0.0391194792265315,
      "learning_rate": 1.4238675016629338e-05,
      "loss": 0.3803,
      "step": 820
    },
    {
      "epoch": 1.5243342516069789,
      "grad_norm": 0.046767033474025464,
      "learning_rate": 1.3244698375542491e-05,
      "loss": 0.3963,
      "step": 830
    },
    {
      "epoch": 1.5426997245179064,
      "grad_norm": 0.042865074045739464,
      "learning_rate": 1.2281364768991804e-05,
      "loss": 0.3765,
      "step": 840
    },
    {
      "epoch": 1.5610651974288339,
      "grad_norm": 0.04190703980014869,
      "learning_rate": 1.134947733186315e-05,
      "loss": 0.3761,
      "step": 850
    },
    {
      "epoch": 1.5794306703397614,
      "grad_norm": 0.04183784386577142,
      "learning_rate": 1.0449812982251556e-05,
      "loss": 0.3852,
      "step": 860
    },
    {
      "epoch": 1.5977961432506889,
      "grad_norm": 0.03879386200844887,
      "learning_rate": 9.58312177374157e-06,
      "loss": 0.3775,
      "step": 870
    },
    {
      "epoch": 1.6161616161616161,
      "grad_norm": 0.0412465475003067,
      "learning_rate": 8.75012627008489e-06,
      "loss": 0.3818,
      "step": 880
    },
    {
      "epoch": 1.6345270890725436,
      "grad_norm": 0.04486626572785468,
      "learning_rate": 7.951520942796025e-06,
      "loss": 0.3871,
      "step": 890
    },
    {
      "epoch": 1.6528925619834711,
      "grad_norm": 0.040934222197984614,
      "learning_rate": 7.187971592168935e-06,
      "loss": 0.3739,
      "step": 900
    },
    {
      "epoch": 1.6712580348943984,
      "grad_norm": 0.0440521821621867,
      "learning_rate": 6.460114792196642e-06,
      "loss": 0.4001,
      "step": 910
    },
    {
      "epoch": 1.6896235078053259,
      "grad_norm": 0.04130542221023784,
      "learning_rate": 5.768557359857241e-06,
      "loss": 0.3833,
      "step": 920
    },
    {
      "epoch": 1.7079889807162534,
      "grad_norm": 0.041016879516671184,
      "learning_rate": 5.113875849208099e-06,
      "loss": 0.3871,
      "step": 930
    },
    {
      "epoch": 1.7263544536271809,
      "grad_norm": 0.04458130921484639,
      "learning_rate": 4.4966160707107076e-06,
      "loss": 0.3924,
      "step": 940
    },
    {
      "epoch": 1.7447199265381084,
      "grad_norm": 0.04483909598982454,
      "learning_rate": 3.917292636186332e-06,
      "loss": 0.3868,
      "step": 950
    },
    {
      "epoch": 1.7630853994490359,
      "grad_norm": 0.04374641202578716,
      "learning_rate": 3.376388529782215e-06,
      "loss": 0.3799,
      "step": 960
    },
    {
      "epoch": 1.7814508723599634,
      "grad_norm": 0.04643290880252759,
      "learning_rate": 2.8743547053058427e-06,
      "loss": 0.3873,
      "step": 970
    },
    {
      "epoch": 1.7998163452708908,
      "grad_norm": 0.04253533673018247,
      "learning_rate": 2.4116097102630907e-06,
      "loss": 0.3851,
      "step": 980
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.039915967047357025,
      "learning_rate": 1.9885393369134976e-06,
      "loss": 0.3773,
      "step": 990
    },
    {
      "epoch": 1.8365472910927456,
      "grad_norm": 0.04344253083810866,
      "learning_rate": 1.6054963006338742e-06,
      "loss": 0.3808,
      "step": 1000
    },
    {
      "epoch": 1.854912764003673,
      "grad_norm": 0.040517415128882496,
      "learning_rate": 1.2627999458580953e-06,
      "loss": 0.376,
      "step": 1010
    },
    {
      "epoch": 1.8732782369146006,
      "grad_norm": 0.040775411667955175,
      "learning_rate": 9.607359798384785e-07,
      "loss": 0.3824,
      "step": 1020
    },
    {
      "epoch": 1.8916437098255279,
      "grad_norm": 0.04533369281933566,
      "learning_rate": 6.995562344505214e-07,
      "loss": 0.3894,
      "step": 1030
    },
    {
      "epoch": 1.9100091827364554,
      "grad_norm": 0.0469790386931614,
      "learning_rate": 4.794784562397458e-07,
      "loss": 0.3746,
      "step": 1040
    },
    {
      "epoch": 1.9283746556473829,
      "grad_norm": 0.04179832849248068,
      "learning_rate": 3.006861248855408e-07,
      "loss": 0.3904,
      "step": 1050
    },
    {
      "epoch": 1.9467401285583104,
      "grad_norm": 0.04097669604600613,
      "learning_rate": 1.6332830023350064e-07,
      "loss": 0.3905,
      "step": 1060
    },
    {
      "epoch": 1.9651056014692379,
      "grad_norm": 0.043233457078553805,
      "learning_rate": 6.751949802362711e-08,
      "loss": 0.3777,
      "step": 1070
    },
    {
      "epoch": 1.9834710743801653,
      "grad_norm": 0.0432832073055713,
      "learning_rate": 1.3339594418138035e-08,
      "loss": 0.3719,
      "step": 1080
    },
    {
      "epoch": 1.9981634527089072,
      "step": 1088,
      "total_flos": 525248790528000.0,
      "train_loss": 0.4096706003388938,
      "train_runtime": 32183.0135,
      "train_samples_per_second": 4.058,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 10,
  "max_steps": 1088,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 5000,
  "total_flos": 525248790528000.0,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}