|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997049277072882, |
|
"eval_steps": 500, |
|
"global_step": 847, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011802891708468575, |
|
"grad_norm": 2.2601828575134277, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 3.4593, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02360578341693715, |
|
"grad_norm": 2.753499746322632, |
|
"learning_rate": 1e-05, |
|
"loss": 2.9512, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03540867512540572, |
|
"grad_norm": 1.42862069606781, |
|
"learning_rate": 1.588235294117647e-05, |
|
"loss": 2.2602, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0472115668338743, |
|
"grad_norm": 2.2324745655059814, |
|
"learning_rate": 2.1764705882352943e-05, |
|
"loss": 1.9542, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05901445854234287, |
|
"grad_norm": 0.9986198544502258, |
|
"learning_rate": 2.7647058823529416e-05, |
|
"loss": 1.7491, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07081735025081144, |
|
"grad_norm": 1.7015349864959717, |
|
"learning_rate": 3.352941176470588e-05, |
|
"loss": 1.7127, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08262024195928003, |
|
"grad_norm": 1.410741925239563, |
|
"learning_rate": 3.9411764705882356e-05, |
|
"loss": 1.5587, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0944231336677486, |
|
"grad_norm": 1.4795621633529663, |
|
"learning_rate": 4.5294117647058826e-05, |
|
"loss": 1.6199, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10622602537621717, |
|
"grad_norm": 1.2762919664382935, |
|
"learning_rate": 4.999915012051437e-05, |
|
"loss": 1.5481, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11802891708468574, |
|
"grad_norm": 1.1632708311080933, |
|
"learning_rate": 4.996941040535653e-05, |
|
"loss": 1.4648, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1298318087931543, |
|
"grad_norm": 1.950888752937317, |
|
"learning_rate": 4.989723448187131e-05, |
|
"loss": 1.5273, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14163470050162288, |
|
"grad_norm": 1.0861647129058838, |
|
"learning_rate": 4.978274501505061e-05, |
|
"loss": 1.5472, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15343759221009148, |
|
"grad_norm": 1.347829818725586, |
|
"learning_rate": 4.962613658293158e-05, |
|
"loss": 1.5428, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16524048391856005, |
|
"grad_norm": 1.014880657196045, |
|
"learning_rate": 4.942767534590581e-05, |
|
"loss": 1.3644, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17704337562702863, |
|
"grad_norm": 1.7476609945297241, |
|
"learning_rate": 4.918769859437232e-05, |
|
"loss": 1.4653, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1888462673354972, |
|
"grad_norm": 1.0597649812698364, |
|
"learning_rate": 4.890661417550319e-05, |
|
"loss": 1.5003, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20064915904396577, |
|
"grad_norm": 1.098489761352539, |
|
"learning_rate": 4.8584899800095864e-05, |
|
"loss": 1.462, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21245205075243434, |
|
"grad_norm": 0.8845277428627014, |
|
"learning_rate": 4.822310223069039e-05, |
|
"loss": 1.3805, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2242549424609029, |
|
"grad_norm": 1.181378960609436, |
|
"learning_rate": 4.782183635233124e-05, |
|
"loss": 1.4678, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23605783416937148, |
|
"grad_norm": 1.3490713834762573, |
|
"learning_rate": 4.738178412755306e-05, |
|
"loss": 1.378, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24786072587784008, |
|
"grad_norm": 1.1515178680419922, |
|
"learning_rate": 4.690369343736636e-05, |
|
"loss": 1.4216, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2596636175863086, |
|
"grad_norm": 1.3116216659545898, |
|
"learning_rate": 4.6388376810212905e-05, |
|
"loss": 1.3915, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2714665092947772, |
|
"grad_norm": 1.6775559186935425, |
|
"learning_rate": 4.583671004105096e-05, |
|
"loss": 1.4104, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28326940100324577, |
|
"grad_norm": 1.4933921098709106, |
|
"learning_rate": 4.524963070291744e-05, |
|
"loss": 1.411, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29507229271171437, |
|
"grad_norm": 1.057346224784851, |
|
"learning_rate": 4.4628136553496375e-05, |
|
"loss": 1.3628, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30687518442018297, |
|
"grad_norm": 1.389323353767395, |
|
"learning_rate": 4.397328383940196e-05, |
|
"loss": 1.331, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3186780761286515, |
|
"grad_norm": 1.2937726974487305, |
|
"learning_rate": 4.328618550105802e-05, |
|
"loss": 1.338, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3304809678371201, |
|
"grad_norm": 1.0137064456939697, |
|
"learning_rate": 4.256800928122475e-05, |
|
"loss": 1.4131, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34228385954558865, |
|
"grad_norm": 1.2065379619598389, |
|
"learning_rate": 4.181997574038741e-05, |
|
"loss": 1.3584, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35408675125405725, |
|
"grad_norm": 1.0482007265090942, |
|
"learning_rate": 4.104335618237972e-05, |
|
"loss": 1.3541, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3658896429625258, |
|
"grad_norm": 1.1176925897598267, |
|
"learning_rate": 4.0239470493767704e-05, |
|
"loss": 1.359, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3776925346709944, |
|
"grad_norm": 0.9922409653663635, |
|
"learning_rate": 3.940968490066559e-05, |
|
"loss": 1.261, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.389495426379463, |
|
"grad_norm": 1.4820419549942017, |
|
"learning_rate": 3.855540964679658e-05, |
|
"loss": 1.2903, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40129831808793154, |
|
"grad_norm": 1.443935751914978, |
|
"learning_rate": 3.767809659674433e-05, |
|
"loss": 1.3593, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41310120979640014, |
|
"grad_norm": 2.1749682426452637, |
|
"learning_rate": 3.677923676846864e-05, |
|
"loss": 1.3608, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4249041015048687, |
|
"grad_norm": 1.4614121913909912, |
|
"learning_rate": 3.586035779927896e-05, |
|
"loss": 1.2742, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4367069932133373, |
|
"grad_norm": 1.1197657585144043, |
|
"learning_rate": 3.492302134957218e-05, |
|
"loss": 1.3217, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4485098849218058, |
|
"grad_norm": 1.1603928804397583, |
|
"learning_rate": 3.396882044874736e-05, |
|
"loss": 1.2824, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4603127766302744, |
|
"grad_norm": 1.6083821058273315, |
|
"learning_rate": 3.2999376787807864e-05, |
|
"loss": 1.344, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47211566833874297, |
|
"grad_norm": 1.56455397605896, |
|
"learning_rate": 3.201633796325233e-05, |
|
"loss": 1.3372, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.48391856004721157, |
|
"grad_norm": 1.4750654697418213, |
|
"learning_rate": 3.1021374676938584e-05, |
|
"loss": 1.33, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.49572145175568016, |
|
"grad_norm": 1.2510316371917725, |
|
"learning_rate": 3.0016177896679255e-05, |
|
"loss": 1.2919, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5075243434641488, |
|
"grad_norm": 1.2658268213272095, |
|
"learning_rate": 2.9002455982394944e-05, |
|
"loss": 1.2649, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5193272351726173, |
|
"grad_norm": 1.905948519706726, |
|
"learning_rate": 2.798193178270889e-05, |
|
"loss": 1.3047, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5311301268810859, |
|
"grad_norm": 1.2382662296295166, |
|
"learning_rate": 2.695633970691786e-05, |
|
"loss": 1.2862, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5429330185895545, |
|
"grad_norm": 1.2122917175292969, |
|
"learning_rate": 2.592742277731513e-05, |
|
"loss": 1.2843, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.554735910298023, |
|
"grad_norm": 1.3638920783996582, |
|
"learning_rate": 2.489692966687566e-05, |
|
"loss": 1.2795, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5665388020064915, |
|
"grad_norm": 1.2353355884552002, |
|
"learning_rate": 2.386661172733762e-05, |
|
"loss": 1.1897, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5783416937149601, |
|
"grad_norm": 1.440238118171692, |
|
"learning_rate": 2.2838220012731365e-05, |
|
"loss": 1.3352, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5901445854234287, |
|
"grad_norm": 1.302875280380249, |
|
"learning_rate": 2.1813502303414306e-05, |
|
"loss": 1.2552, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6019474771318973, |
|
"grad_norm": 1.3485292196273804, |
|
"learning_rate": 2.0794200135669584e-05, |
|
"loss": 1.2573, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6137503688403659, |
|
"grad_norm": 2.1018223762512207, |
|
"learning_rate": 1.9782045841916625e-05, |
|
"loss": 1.2564, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6255532605488344, |
|
"grad_norm": 1.6336476802825928, |
|
"learning_rate": 1.877875960656394e-05, |
|
"loss": 1.1512, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.637356152257303, |
|
"grad_norm": 1.4360566139221191, |
|
"learning_rate": 1.7786046542507843e-05, |
|
"loss": 1.2434, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6491590439657716, |
|
"grad_norm": 1.1216990947723389, |
|
"learning_rate": 1.680559379324558e-05, |
|
"loss": 1.325, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6609619356742402, |
|
"grad_norm": 1.6999801397323608, |
|
"learning_rate": 1.583906766552799e-05, |
|
"loss": 1.2197, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6727648273827088, |
|
"grad_norm": 1.4907481670379639, |
|
"learning_rate": 1.4888110797424782e-05, |
|
"loss": 1.2821, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6845677190911773, |
|
"grad_norm": 1.2150344848632812, |
|
"learning_rate": 1.3954339366615334e-05, |
|
"loss": 1.239, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6963706107996459, |
|
"grad_norm": 1.6709622144699097, |
|
"learning_rate": 1.303934034364983e-05, |
|
"loss": 1.2403, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7081735025081145, |
|
"grad_norm": 1.5160703659057617, |
|
"learning_rate": 1.21446687948485e-05, |
|
"loss": 1.2466, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7199763942165831, |
|
"grad_norm": 1.2667752504348755, |
|
"learning_rate": 1.1271845239423196e-05, |
|
"loss": 1.1662, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7317792859250516, |
|
"grad_norm": 1.685145616531372, |
|
"learning_rate": 1.0422353065312573e-05, |
|
"loss": 1.3161, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7435821776335202, |
|
"grad_norm": 1.5131856203079224, |
|
"learning_rate": 9.59763600812305e-06, |
|
"loss": 1.2608, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7553850693419888, |
|
"grad_norm": 1.2261701822280884, |
|
"learning_rate": 8.79909569745987e-06, |
|
"loss": 1.1507, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7671879610504574, |
|
"grad_norm": 1.2804995775222778, |
|
"learning_rate": 8.028089274818624e-06, |
|
"loss": 1.3008, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.778990852758926, |
|
"grad_norm": 1.3678828477859497, |
|
"learning_rate": 7.285927087085423e-06, |
|
"loss": 1.272, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7907937444673945, |
|
"grad_norm": 1.3345593214035034, |
|
"learning_rate": 6.5738704595659065e-06, |
|
"loss": 1.1615, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8025966361758631, |
|
"grad_norm": 1.2585678100585938, |
|
"learning_rate": 5.893129552327781e-06, |
|
"loss": 1.1878, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8143995278843317, |
|
"grad_norm": 1.3462913036346436, |
|
"learning_rate": 5.244861303500026e-06, |
|
"loss": 1.2436, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8262024195928003, |
|
"grad_norm": 1.1118088960647583, |
|
"learning_rate": 4.630167463024393e-06, |
|
"loss": 1.0838, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8380053113012688, |
|
"grad_norm": 1.7299799919128418, |
|
"learning_rate": 4.050092720200638e-06, |
|
"loss": 1.1495, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8498082030097374, |
|
"grad_norm": 1.3773056268692017, |
|
"learning_rate": 3.5056229282080077e-06, |
|
"loss": 1.234, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.861611094718206, |
|
"grad_norm": 1.2820888757705688, |
|
"learning_rate": 2.997683428620296e-06, |
|
"loss": 1.1803, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8734139864266746, |
|
"grad_norm": 1.3301385641098022, |
|
"learning_rate": 2.527137478762037e-06, |
|
"loss": 1.2197, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8852168781351432, |
|
"grad_norm": 1.7628834247589111, |
|
"learning_rate": 2.094784784578707e-06, |
|
"loss": 1.2354, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8970197698436116, |
|
"grad_norm": 1.2032676935195923, |
|
"learning_rate": 1.7013601415141383e-06, |
|
"loss": 1.1835, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9088226615520802, |
|
"grad_norm": 1.5983058214187622, |
|
"learning_rate": 1.3475321857052386e-06, |
|
"loss": 1.1651, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9206255532605488, |
|
"grad_norm": 1.0227899551391602, |
|
"learning_rate": 1.03390225761624e-06, |
|
"loss": 1.1662, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9324284449690174, |
|
"grad_norm": 1.352665901184082, |
|
"learning_rate": 7.610033800438344e-07, |
|
"loss": 1.1798, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9442313366774859, |
|
"grad_norm": 1.6476454734802246, |
|
"learning_rate": 5.292993522301005e-07, |
|
"loss": 1.2053, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9560342283859545, |
|
"grad_norm": 1.2775633335113525, |
|
"learning_rate": 3.3918396162275214e-07, |
|
"loss": 1.2049, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9678371200944231, |
|
"grad_norm": 1.4991925954818726, |
|
"learning_rate": 1.9098031462242705e-07, |
|
"loss": 1.2097, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9796400118028917, |
|
"grad_norm": 1.3501712083816528, |
|
"learning_rate": 8.494028745434368e-08, |
|
"loss": 1.2085, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9914429035113603, |
|
"grad_norm": 1.319488763809204, |
|
"learning_rate": 2.124409809766692e-08, |
|
"loss": 1.1854, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9997049277072882, |
|
"step": 847, |
|
"total_flos": 1.2532647345436754e+18, |
|
"train_loss": 1.3799798170537847, |
|
"train_runtime": 10524.3823, |
|
"train_samples_per_second": 2.576, |
|
"train_steps_per_second": 0.08 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 847, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2532647345436754e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|