{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9946777054997042,
  "eval_steps": 500,
  "global_step": 1266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02365464222353637,
      "grad_norm": 4.438574633904732,
      "learning_rate": 5e-06,
      "loss": 0.8864,
      "step": 10
    },
    {
      "epoch": 0.04730928444707274,
      "grad_norm": 3.743436107422786,
      "learning_rate": 5e-06,
      "loss": 0.7865,
      "step": 20
    },
    {
      "epoch": 0.0709639266706091,
      "grad_norm": 1.3773266130731683,
      "learning_rate": 5e-06,
      "loss": 0.7718,
      "step": 30
    },
    {
      "epoch": 0.09461856889414548,
      "grad_norm": 1.0328014979269187,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 40
    },
    {
      "epoch": 0.11827321111768184,
      "grad_norm": 0.9419322611015702,
      "learning_rate": 5e-06,
      "loss": 0.7263,
      "step": 50
    },
    {
      "epoch": 0.1419278533412182,
      "grad_norm": 1.0108429467163185,
      "learning_rate": 5e-06,
      "loss": 0.7095,
      "step": 60
    },
    {
      "epoch": 0.16558249556475457,
      "grad_norm": 0.7116397296212089,
      "learning_rate": 5e-06,
      "loss": 0.7076,
      "step": 70
    },
    {
      "epoch": 0.18923713778829096,
      "grad_norm": 0.6226004481870809,
      "learning_rate": 5e-06,
      "loss": 0.6923,
      "step": 80
    },
    {
      "epoch": 0.21289178001182732,
      "grad_norm": 0.7788334547916798,
      "learning_rate": 5e-06,
      "loss": 0.6939,
      "step": 90
    },
    {
      "epoch": 0.23654642223536368,
      "grad_norm": 0.5579006889125326,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 100
    },
    {
      "epoch": 0.26020106445890007,
      "grad_norm": 0.8348224630921526,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 110
    },
    {
      "epoch": 0.2838557066824364,
      "grad_norm": 1.0981466464621934,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 120
    },
    {
      "epoch": 0.3075103489059728,
      "grad_norm": 0.8917486118802607,
      "learning_rate": 5e-06,
      "loss": 0.6761,
      "step": 130
    },
    {
      "epoch": 0.33116499112950915,
      "grad_norm": 0.6543556998725211,
      "learning_rate": 5e-06,
      "loss": 0.6715,
      "step": 140
    },
    {
      "epoch": 0.35481963335304556,
      "grad_norm": 0.8125188565799322,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 150
    },
    {
      "epoch": 0.3784742755765819,
      "grad_norm": 0.5688602721338011,
      "learning_rate": 5e-06,
      "loss": 0.674,
      "step": 160
    },
    {
      "epoch": 0.4021289178001183,
      "grad_norm": 0.491756225050921,
      "learning_rate": 5e-06,
      "loss": 0.6815,
      "step": 170
    },
    {
      "epoch": 0.42578356002365464,
      "grad_norm": 0.6509377411475789,
      "learning_rate": 5e-06,
      "loss": 0.669,
      "step": 180
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.6207735885705108,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 190
    },
    {
      "epoch": 0.47309284447072736,
      "grad_norm": 0.528808960645126,
      "learning_rate": 5e-06,
      "loss": 0.6674,
      "step": 200
    },
    {
      "epoch": 0.4967474866942638,
      "grad_norm": 0.8472585224068009,
      "learning_rate": 5e-06,
      "loss": 0.6637,
      "step": 210
    },
    {
      "epoch": 0.5204021289178001,
      "grad_norm": 0.5848774410670773,
      "learning_rate": 5e-06,
      "loss": 0.6681,
      "step": 220
    },
    {
      "epoch": 0.5440567711413364,
      "grad_norm": 0.6344280914148243,
      "learning_rate": 5e-06,
      "loss": 0.6675,
      "step": 230
    },
    {
      "epoch": 0.5677114133648729,
      "grad_norm": 0.7485250474806475,
      "learning_rate": 5e-06,
      "loss": 0.6549,
      "step": 240
    },
    {
      "epoch": 0.5913660555884093,
      "grad_norm": 0.7332796111883003,
      "learning_rate": 5e-06,
      "loss": 0.6693,
      "step": 250
    },
    {
      "epoch": 0.6150206978119456,
      "grad_norm": 0.5097881381268425,
      "learning_rate": 5e-06,
      "loss": 0.6603,
      "step": 260
    },
    {
      "epoch": 0.638675340035482,
      "grad_norm": 0.49176638410796597,
      "learning_rate": 5e-06,
      "loss": 0.6697,
      "step": 270
    },
    {
      "epoch": 0.6623299822590183,
      "grad_norm": 0.4404068786810332,
      "learning_rate": 5e-06,
      "loss": 0.6547,
      "step": 280
    },
    {
      "epoch": 0.6859846244825547,
      "grad_norm": 0.45956579197536424,
      "learning_rate": 5e-06,
      "loss": 0.6617,
      "step": 290
    },
    {
      "epoch": 0.7096392667060911,
      "grad_norm": 0.5489471352518822,
      "learning_rate": 5e-06,
      "loss": 0.65,
      "step": 300
    },
    {
      "epoch": 0.7332939089296274,
      "grad_norm": 0.5071925000559494,
      "learning_rate": 5e-06,
      "loss": 0.6639,
      "step": 310
    },
    {
      "epoch": 0.7569485511531638,
      "grad_norm": 0.4479532221316009,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 320
    },
    {
      "epoch": 0.7806031933767001,
      "grad_norm": 0.5768991016278898,
      "learning_rate": 5e-06,
      "loss": 0.6601,
      "step": 330
    },
    {
      "epoch": 0.8042578356002366,
      "grad_norm": 0.47440990509293773,
      "learning_rate": 5e-06,
      "loss": 0.6457,
      "step": 340
    },
    {
      "epoch": 0.8279124778237729,
      "grad_norm": 0.5535872360742707,
      "learning_rate": 5e-06,
      "loss": 0.6552,
      "step": 350
    },
    {
      "epoch": 0.8515671200473093,
      "grad_norm": 0.5692309710225549,
      "learning_rate": 5e-06,
      "loss": 0.6533,
      "step": 360
    },
    {
      "epoch": 0.8752217622708457,
      "grad_norm": 0.477161966378483,
      "learning_rate": 5e-06,
      "loss": 0.6538,
      "step": 370
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.5742339288892304,
      "learning_rate": 5e-06,
      "loss": 0.6576,
      "step": 380
    },
    {
      "epoch": 0.9225310467179184,
      "grad_norm": 0.6276883665595651,
      "learning_rate": 5e-06,
      "loss": 0.6524,
      "step": 390
    },
    {
      "epoch": 0.9461856889414547,
      "grad_norm": 0.5739867998426434,
      "learning_rate": 5e-06,
      "loss": 0.645,
      "step": 400
    },
    {
      "epoch": 0.9698403311649911,
      "grad_norm": 0.6714991896688636,
      "learning_rate": 5e-06,
      "loss": 0.6552,
      "step": 410
    },
    {
      "epoch": 0.9934949733885275,
      "grad_norm": 0.5532850958647405,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 420
    },
    {
      "epoch": 0.9982259018332348,
      "eval_loss": 0.6549943685531616,
      "eval_runtime": 226.5831,
      "eval_samples_per_second": 50.255,
      "eval_steps_per_second": 0.393,
      "step": 422
    },
    {
      "epoch": 1.0171496156120639,
      "grad_norm": 0.6577034498689445,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 430
    },
    {
      "epoch": 1.0408042578356003,
      "grad_norm": 0.6919211184212087,
      "learning_rate": 5e-06,
      "loss": 0.611,
      "step": 440
    },
    {
      "epoch": 1.0644589000591367,
      "grad_norm": 0.49818432774881954,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 450
    },
    {
      "epoch": 1.0881135422826729,
      "grad_norm": 0.5304613381757841,
      "learning_rate": 5e-06,
      "loss": 0.6118,
      "step": 460
    },
    {
      "epoch": 1.1117681845062093,
      "grad_norm": 0.5661240677080396,
      "learning_rate": 5e-06,
      "loss": 0.6069,
      "step": 470
    },
    {
      "epoch": 1.1354228267297457,
      "grad_norm": 0.4725657759678031,
      "learning_rate": 5e-06,
      "loss": 0.6081,
      "step": 480
    },
    {
      "epoch": 1.1590774689532821,
      "grad_norm": 0.5348982555181953,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 490
    },
    {
      "epoch": 1.1827321111768185,
      "grad_norm": 0.7009257467225577,
      "learning_rate": 5e-06,
      "loss": 0.6107,
      "step": 500
    },
    {
      "epoch": 1.2063867534003547,
      "grad_norm": 0.516755234113577,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 510
    },
    {
      "epoch": 1.2300413956238911,
      "grad_norm": 0.5565870238553596,
      "learning_rate": 5e-06,
      "loss": 0.6064,
      "step": 520
    },
    {
      "epoch": 1.2536960378474276,
      "grad_norm": 0.5176934237005286,
      "learning_rate": 5e-06,
      "loss": 0.6078,
      "step": 530
    },
    {
      "epoch": 1.277350680070964,
      "grad_norm": 0.5399011019791115,
      "learning_rate": 5e-06,
      "loss": 0.6173,
      "step": 540
    },
    {
      "epoch": 1.3010053222945004,
      "grad_norm": 0.48804065232921706,
      "learning_rate": 5e-06,
      "loss": 0.6089,
      "step": 550
    },
    {
      "epoch": 1.3246599645180366,
      "grad_norm": 0.6247022748083035,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 560
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.5205181494692162,
      "learning_rate": 5e-06,
      "loss": 0.6087,
      "step": 570
    },
    {
      "epoch": 1.3719692489651094,
      "grad_norm": 0.4444906716754459,
      "learning_rate": 5e-06,
      "loss": 0.6129,
      "step": 580
    },
    {
      "epoch": 1.3956238911886458,
      "grad_norm": 0.4699507974891951,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 590
    },
    {
      "epoch": 1.4192785334121822,
      "grad_norm": 0.438759746705871,
      "learning_rate": 5e-06,
      "loss": 0.6118,
      "step": 600
    },
    {
      "epoch": 1.4429331756357184,
      "grad_norm": 0.492167276336904,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 610
    },
    {
      "epoch": 1.4665878178592548,
      "grad_norm": 0.508896134049524,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 620
    },
    {
      "epoch": 1.4902424600827913,
      "grad_norm": 0.5044935497801236,
      "learning_rate": 5e-06,
      "loss": 0.6091,
      "step": 630
    },
    {
      "epoch": 1.5138971023063275,
      "grad_norm": 0.611513828523435,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 640
    },
    {
      "epoch": 1.537551744529864,
      "grad_norm": 0.4284886945389684,
      "learning_rate": 5e-06,
      "loss": 0.6161,
      "step": 650
    },
    {
      "epoch": 1.5612063867534003,
      "grad_norm": 0.7397737868115762,
      "learning_rate": 5e-06,
      "loss": 0.611,
      "step": 660
    },
    {
      "epoch": 1.5848610289769367,
      "grad_norm": 0.45834776518516607,
      "learning_rate": 5e-06,
      "loss": 0.6054,
      "step": 670
    },
    {
      "epoch": 1.6085156712004731,
      "grad_norm": 0.9830107568320281,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 680
    },
    {
      "epoch": 1.6321703134240093,
      "grad_norm": 0.7316301006660809,
      "learning_rate": 5e-06,
      "loss": 0.6066,
      "step": 690
    },
    {
      "epoch": 1.655824955647546,
      "grad_norm": 0.8713979852654485,
      "learning_rate": 5e-06,
      "loss": 0.6112,
      "step": 700
    },
    {
      "epoch": 1.6794795978710821,
      "grad_norm": 0.6609904307136948,
      "learning_rate": 5e-06,
      "loss": 0.6063,
      "step": 710
    },
    {
      "epoch": 1.7031342400946186,
      "grad_norm": 0.5730385784454821,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 720
    },
    {
      "epoch": 1.726788882318155,
      "grad_norm": 0.428675321624077,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 730
    },
    {
      "epoch": 1.7504435245416912,
      "grad_norm": 0.5747845078803645,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 740
    },
    {
      "epoch": 1.7740981667652278,
      "grad_norm": 0.6019599666582006,
      "learning_rate": 5e-06,
      "loss": 0.6074,
      "step": 750
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.484871680178572,
      "learning_rate": 5e-06,
      "loss": 0.6076,
      "step": 760
    },
    {
      "epoch": 1.8214074512123004,
      "grad_norm": 0.4801879662753807,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 770
    },
    {
      "epoch": 1.8450620934358368,
      "grad_norm": 0.44588625446373603,
      "learning_rate": 5e-06,
      "loss": 0.6078,
      "step": 780
    },
    {
      "epoch": 1.868716735659373,
      "grad_norm": 0.43921853360548113,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 790
    },
    {
      "epoch": 1.8923713778829097,
      "grad_norm": 0.4461189307976923,
      "learning_rate": 5e-06,
      "loss": 0.6105,
      "step": 800
    },
    {
      "epoch": 1.9160260201064458,
      "grad_norm": 0.4949761836327779,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 810
    },
    {
      "epoch": 1.9396806623299823,
      "grad_norm": 0.44062812260467765,
      "learning_rate": 5e-06,
      "loss": 0.6049,
      "step": 820
    },
    {
      "epoch": 1.9633353045535187,
      "grad_norm": 0.4460160929815086,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 830
    },
    {
      "epoch": 1.9869899467770549,
      "grad_norm": 0.5175110899521405,
      "learning_rate": 5e-06,
      "loss": 0.6028,
      "step": 840
    },
    {
      "epoch": 1.9988172678888232,
      "eval_loss": 0.6445377469062805,
      "eval_runtime": 227.0244,
      "eval_samples_per_second": 50.158,
      "eval_steps_per_second": 0.392,
      "step": 845
    },
    {
      "epoch": 2.0106445890005915,
      "grad_norm": 0.5892375412389526,
      "learning_rate": 5e-06,
      "loss": 0.5851,
      "step": 850
    },
    {
      "epoch": 2.0342992312241277,
      "grad_norm": 0.5097516172118646,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 860
    },
    {
      "epoch": 2.057953873447664,
      "grad_norm": 0.6536746176311915,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 870
    },
    {
      "epoch": 2.0816085156712005,
      "grad_norm": 0.47983268810356666,
      "learning_rate": 5e-06,
      "loss": 0.5626,
      "step": 880
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.6017282349204336,
      "learning_rate": 5e-06,
      "loss": 0.5697,
      "step": 890
    },
    {
      "epoch": 2.1289178001182734,
      "grad_norm": 0.5728407157654074,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 900
    },
    {
      "epoch": 2.1525724423418096,
      "grad_norm": 0.5680779384221303,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 910
    },
    {
      "epoch": 2.1762270845653457,
      "grad_norm": 0.48858908601906337,
      "learning_rate": 5e-06,
      "loss": 0.567,
      "step": 920
    },
    {
      "epoch": 2.1998817267888824,
      "grad_norm": 0.5005707887249943,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 930
    },
    {
      "epoch": 2.2235363690124186,
      "grad_norm": 0.5829558904651037,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 940
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.48798199303667406,
      "learning_rate": 5e-06,
      "loss": 0.578,
      "step": 950
    },
    {
      "epoch": 2.2708456534594914,
      "grad_norm": 0.582446153234459,
      "learning_rate": 5e-06,
      "loss": 0.5682,
      "step": 960
    },
    {
      "epoch": 2.2945002956830276,
      "grad_norm": 0.46970294592756995,
      "learning_rate": 5e-06,
      "loss": 0.571,
      "step": 970
    },
    {
      "epoch": 2.3181549379065642,
      "grad_norm": 0.5759020549520256,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 980
    },
    {
      "epoch": 2.3418095801301004,
      "grad_norm": 0.549340588982862,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 990
    },
    {
      "epoch": 2.365464222353637,
      "grad_norm": 0.46429208051701265,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 1000
    },
    {
      "epoch": 2.3891188645771733,
      "grad_norm": 0.5160254392452897,
      "learning_rate": 5e-06,
      "loss": 0.558,
      "step": 1010
    },
    {
      "epoch": 2.4127735068007095,
      "grad_norm": 0.4799281597192369,
      "learning_rate": 5e-06,
      "loss": 0.5627,
      "step": 1020
    },
    {
      "epoch": 2.436428149024246,
      "grad_norm": 0.5121330286207769,
      "learning_rate": 5e-06,
      "loss": 0.5608,
      "step": 1030
    },
    {
      "epoch": 2.4600827912477823,
      "grad_norm": 0.5841580086447481,
      "learning_rate": 5e-06,
      "loss": 0.5693,
      "step": 1040
    },
    {
      "epoch": 2.483737433471319,
      "grad_norm": 0.557020183414569,
      "learning_rate": 5e-06,
      "loss": 0.5726,
      "step": 1050
    },
    {
      "epoch": 2.507392075694855,
      "grad_norm": 0.6374112998842234,
      "learning_rate": 5e-06,
      "loss": 0.5747,
      "step": 1060
    },
    {
      "epoch": 2.5310467179183913,
      "grad_norm": 0.5343754869995426,
      "learning_rate": 5e-06,
      "loss": 0.5607,
      "step": 1070
    },
    {
      "epoch": 2.554701360141928,
      "grad_norm": 0.5496554790900547,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 1080
    },
    {
      "epoch": 2.578356002365464,
      "grad_norm": 0.6822773581077988,
      "learning_rate": 5e-06,
      "loss": 0.5738,
      "step": 1090
    },
    {
      "epoch": 2.6020106445890008,
      "grad_norm": 0.49632724788385346,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 1100
    },
    {
      "epoch": 2.625665286812537,
      "grad_norm": 0.4859614320386073,
      "learning_rate": 5e-06,
      "loss": 0.5704,
      "step": 1110
    },
    {
      "epoch": 2.649319929036073,
      "grad_norm": 0.5005521245693028,
      "learning_rate": 5e-06,
      "loss": 0.5721,
      "step": 1120
    },
    {
      "epoch": 2.67297457125961,
      "grad_norm": 0.5418331476470847,
      "learning_rate": 5e-06,
      "loss": 0.5719,
      "step": 1130
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 0.4518235189693759,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 1140
    },
    {
      "epoch": 2.7202838557066826,
      "grad_norm": 0.45229828628235735,
      "learning_rate": 5e-06,
      "loss": 0.5638,
      "step": 1150
    },
    {
      "epoch": 2.743938497930219,
      "grad_norm": 0.5051148207876189,
      "learning_rate": 5e-06,
      "loss": 0.5748,
      "step": 1160
    },
    {
      "epoch": 2.767593140153755,
      "grad_norm": 0.7455413514573421,
      "learning_rate": 5e-06,
      "loss": 0.5797,
      "step": 1170
    },
    {
      "epoch": 2.7912477823772917,
      "grad_norm": 0.5362145068936747,
      "learning_rate": 5e-06,
      "loss": 0.5705,
      "step": 1180
    },
    {
      "epoch": 2.814902424600828,
      "grad_norm": 0.46118669511344673,
      "learning_rate": 5e-06,
      "loss": 0.5653,
      "step": 1190
    },
    {
      "epoch": 2.8385570668243645,
      "grad_norm": 0.5498761802579338,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 1200
    },
    {
      "epoch": 2.8622117090479007,
      "grad_norm": 0.5720658060375756,
      "learning_rate": 5e-06,
      "loss": 0.5761,
      "step": 1210
    },
    {
      "epoch": 2.885866351271437,
      "grad_norm": 0.4735883791639776,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 1220
    },
    {
      "epoch": 2.9095209934949735,
      "grad_norm": 0.6126626053091963,
      "learning_rate": 5e-06,
      "loss": 0.5665,
      "step": 1230
    },
    {
      "epoch": 2.9331756357185097,
      "grad_norm": 0.5724885076669786,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 1240
    },
    {
      "epoch": 2.9568302779420463,
      "grad_norm": 0.5144727847784881,
      "learning_rate": 5e-06,
      "loss": 0.5721,
      "step": 1250
    },
    {
      "epoch": 2.9804849201655825,
      "grad_norm": 0.4637250585550989,
      "learning_rate": 5e-06,
      "loss": 0.5645,
      "step": 1260
    },
    {
      "epoch": 2.9946777054997042,
      "eval_loss": 0.6453979015350342,
      "eval_runtime": 226.895,
      "eval_samples_per_second": 50.186,
      "eval_steps_per_second": 0.392,
      "step": 1266
    },
    {
      "epoch": 2.9946777054997042,
      "step": 1266,
      "total_flos": 2120178393415680.0,
      "train_loss": 0.6197805719164687,
      "train_runtime": 38167.6556,
      "train_samples_per_second": 17.005,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1266,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2120178393415680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}