|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 1179, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02544529262086514, |
|
"grad_norm": 3.0892276763916016, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8937, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05089058524173028, |
|
"grad_norm": 1.0776044130325317, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7655, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07633587786259542, |
|
"grad_norm": 2.4046175479888916, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7156, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10178117048346055, |
|
"grad_norm": 1.0319740772247314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6884, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1272264631043257, |
|
"grad_norm": 0.8365700840950012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6666, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15267175572519084, |
|
"grad_norm": 1.0443907976150513, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6517, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.178117048346056, |
|
"grad_norm": 0.6737371683120728, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6476, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2035623409669211, |
|
"grad_norm": 0.7133864760398865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6313, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22900763358778625, |
|
"grad_norm": 0.873466432094574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6223, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2544529262086514, |
|
"grad_norm": 0.8450167179107666, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6252, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.27989821882951654, |
|
"grad_norm": 0.5868242383003235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6153, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3053435114503817, |
|
"grad_norm": 0.6943073868751526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6155, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33078880407124683, |
|
"grad_norm": 0.568712592124939, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6086, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.356234096692112, |
|
"grad_norm": 0.5304349660873413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6046, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3816793893129771, |
|
"grad_norm": 0.5914269089698792, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6014, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4071246819338422, |
|
"grad_norm": 0.6255616545677185, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6059, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.43256997455470736, |
|
"grad_norm": 0.6703746318817139, |
|
"learning_rate": 5e-06, |
|
"loss": 0.595, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4580152671755725, |
|
"grad_norm": 0.6239282488822937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5961, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.48346055979643765, |
|
"grad_norm": 0.5409078598022461, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5936, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5089058524173028, |
|
"grad_norm": 0.47156909108161926, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5871, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5343511450381679, |
|
"grad_norm": 0.6108508110046387, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5936, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5597964376590331, |
|
"grad_norm": 0.510125994682312, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5804, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5852417302798982, |
|
"grad_norm": 0.532505214214325, |
|
"learning_rate": 5e-06, |
|
"loss": 0.583, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6106870229007634, |
|
"grad_norm": 0.732284426689148, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5817, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6361323155216285, |
|
"grad_norm": 0.624311089515686, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5821, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6615776081424937, |
|
"grad_norm": 0.6264485120773315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5805, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6870229007633588, |
|
"grad_norm": 0.5334043502807617, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5818, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.712468193384224, |
|
"grad_norm": 0.5711916089057922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5768, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7379134860050891, |
|
"grad_norm": 0.4914567172527313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5733, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7633587786259542, |
|
"grad_norm": 0.4886472523212433, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5827, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7888040712468194, |
|
"grad_norm": 0.5745688080787659, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5756, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8142493638676844, |
|
"grad_norm": 0.6722710132598877, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5722, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8396946564885496, |
|
"grad_norm": 0.5262378454208374, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5704, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8651399491094147, |
|
"grad_norm": 0.5720959305763245, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5739, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8905852417302799, |
|
"grad_norm": 0.525618314743042, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5648, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.916030534351145, |
|
"grad_norm": 0.5551468133926392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5726, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9414758269720102, |
|
"grad_norm": 0.5689135193824768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5644, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9669211195928753, |
|
"grad_norm": 0.5880911946296692, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5628, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9923664122137404, |
|
"grad_norm": 0.5270939469337463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5649, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.5690909624099731, |
|
"eval_runtime": 35.8687, |
|
"eval_samples_per_second": 294.602, |
|
"eval_steps_per_second": 1.171, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.0178117048346056, |
|
"grad_norm": 0.6128649115562439, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5446, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0432569974554706, |
|
"grad_norm": 0.6446624398231506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5287, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0687022900763359, |
|
"grad_norm": 0.6626042127609253, |
|
"learning_rate": 5e-06, |
|
"loss": 0.528, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.094147582697201, |
|
"grad_norm": 0.6851757168769836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5275, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1195928753180662, |
|
"grad_norm": 0.5607665181159973, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5317, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1450381679389312, |
|
"grad_norm": 0.5421668887138367, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5279, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1704834605597965, |
|
"grad_norm": 0.5087480545043945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5323, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1959287531806615, |
|
"grad_norm": 0.6041454672813416, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5221, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.2213740458015268, |
|
"grad_norm": 0.5389483571052551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5296, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2468193384223918, |
|
"grad_norm": 0.5498439073562622, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5236, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.272264631043257, |
|
"grad_norm": 0.534608781337738, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5274, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.297709923664122, |
|
"grad_norm": 0.5173637866973877, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5234, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3231552162849873, |
|
"grad_norm": 0.640381932258606, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5191, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3486005089058524, |
|
"grad_norm": 0.636634111404419, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5274, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.3740458015267176, |
|
"grad_norm": 0.4825289249420166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5271, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3994910941475827, |
|
"grad_norm": 0.7268803715705872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5239, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.424936386768448, |
|
"grad_norm": 0.5402505993843079, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5212, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.450381679389313, |
|
"grad_norm": 0.5150734186172485, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5147, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.4758269720101782, |
|
"grad_norm": 0.5088803768157959, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5238, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5012722646310432, |
|
"grad_norm": 0.5912160277366638, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5226, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5267175572519083, |
|
"grad_norm": 0.5235996842384338, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5161, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5521628498727735, |
|
"grad_norm": 0.6075550317764282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5157, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.5776081424936388, |
|
"grad_norm": 0.5467493534088135, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5173, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6030534351145038, |
|
"grad_norm": 0.5767125487327576, |
|
"learning_rate": 5e-06, |
|
"loss": 0.518, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6284987277353689, |
|
"grad_norm": 0.6012499332427979, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5136, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.6539440203562341, |
|
"grad_norm": 0.5314013361930847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5175, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6793893129770994, |
|
"grad_norm": 0.6663753390312195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5211, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.7048346055979644, |
|
"grad_norm": 0.6120356917381287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5228, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.7302798982188294, |
|
"grad_norm": 0.6509705781936646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5177, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7557251908396947, |
|
"grad_norm": 0.5193184614181519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5147, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.78117048346056, |
|
"grad_norm": 0.5160722136497498, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5167, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.806615776081425, |
|
"grad_norm": 0.6566229462623596, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5116, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.83206106870229, |
|
"grad_norm": 0.4844520688056946, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5125, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8575063613231553, |
|
"grad_norm": 0.5609481334686279, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5134, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.8829516539440203, |
|
"grad_norm": 0.4838588535785675, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5111, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9083969465648853, |
|
"grad_norm": 0.5089361667633057, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5147, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.9338422391857506, |
|
"grad_norm": 0.4935343861579895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5092, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9592875318066159, |
|
"grad_norm": 0.5261263251304626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5092, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.984732824427481, |
|
"grad_norm": 0.5327720642089844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5093, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.5446056127548218, |
|
"eval_runtime": 35.403, |
|
"eval_samples_per_second": 298.478, |
|
"eval_steps_per_second": 1.186, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.010178117048346, |
|
"grad_norm": 0.7140676975250244, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4964, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.035623409669211, |
|
"grad_norm": 0.5526481866836548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4699, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.0610687022900764, |
|
"grad_norm": 0.8097620010375977, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4747, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.0865139949109412, |
|
"grad_norm": 0.5774556994438171, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4731, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.1119592875318065, |
|
"grad_norm": 0.7799936532974243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.481, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.1374045801526718, |
|
"grad_norm": 0.7324852347373962, |
|
"learning_rate": 5e-06, |
|
"loss": 0.472, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.162849872773537, |
|
"grad_norm": 0.6810862421989441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4788, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.188295165394402, |
|
"grad_norm": 0.6535518169403076, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4787, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.213740458015267, |
|
"grad_norm": 0.5570102334022522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4785, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.2391857506361323, |
|
"grad_norm": 0.5471156239509583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4817, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.2646310432569976, |
|
"grad_norm": 0.5434932112693787, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4732, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.2900763358778624, |
|
"grad_norm": 0.5234562754631042, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4742, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.3155216284987277, |
|
"grad_norm": 0.5137863159179688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4725, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.340966921119593, |
|
"grad_norm": 0.5251743197441101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4793, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.366412213740458, |
|
"grad_norm": 0.5063709020614624, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4735, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.391857506361323, |
|
"grad_norm": 0.5757549405097961, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4728, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.4173027989821882, |
|
"grad_norm": 0.48364320397377014, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4785, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.4427480916030535, |
|
"grad_norm": 0.5600482225418091, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4748, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.4681933842239188, |
|
"grad_norm": 0.5102260708808899, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4752, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.4936386768447836, |
|
"grad_norm": 0.642202615737915, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4743, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.519083969465649, |
|
"grad_norm": 0.7704691886901855, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4743, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.544529262086514, |
|
"grad_norm": 0.6885623931884766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4742, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.569974554707379, |
|
"grad_norm": 0.5846525430679321, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4734, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.595419847328244, |
|
"grad_norm": 0.66885906457901, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4758, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.6208651399491094, |
|
"grad_norm": 0.5633515119552612, |
|
"learning_rate": 5e-06, |
|
"loss": 0.474, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.6463104325699747, |
|
"grad_norm": 0.5127110481262207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4792, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.67175572519084, |
|
"grad_norm": 0.6211147308349609, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4721, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.6972010178117047, |
|
"grad_norm": 0.700775146484375, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4731, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.72264631043257, |
|
"grad_norm": 0.5291451215744019, |
|
"learning_rate": 5e-06, |
|
"loss": 0.475, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.7480916030534353, |
|
"grad_norm": 0.5429026484489441, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4822, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.7735368956743, |
|
"grad_norm": 0.5198021531105042, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4738, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.7989821882951653, |
|
"grad_norm": 0.4582120478153229, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4709, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.8244274809160306, |
|
"grad_norm": 0.47587454319000244, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4692, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.849872773536896, |
|
"grad_norm": 0.5221173167228699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4767, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.875318066157761, |
|
"grad_norm": 0.5617064833641052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.478, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.900763358778626, |
|
"grad_norm": 0.5362435579299927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4745, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.926208651399491, |
|
"grad_norm": 0.5266700387001038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4725, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.9516539440203564, |
|
"grad_norm": 0.5272775292396545, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4744, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.9770992366412212, |
|
"grad_norm": 0.6244516372680664, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4766, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.5409571528434753, |
|
"eval_runtime": 35.9061, |
|
"eval_samples_per_second": 294.296, |
|
"eval_steps_per_second": 1.17, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 1179, |
|
"total_flos": 5.566873103382662e+19, |
|
"train_loss": 0.5357507543102779, |
|
"train_runtime": 8300.7415, |
|
"train_samples_per_second": 72.558, |
|
"train_steps_per_second": 0.142 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1179, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.566873103382662e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|