{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 55077, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02723459883435917, "grad_norm": 1.1466608047485352, "learning_rate": 4.9558799498883386e-05, "loss": 1.2147, "step": 500 }, { "epoch": 0.05446919766871834, "grad_norm": 3.79953932762146, "learning_rate": 4.910852079815531e-05, "loss": 0.4931, "step": 1000 }, { "epoch": 0.0817037965030775, "grad_norm": 2.2004730701446533, "learning_rate": 4.8654610817582656e-05, "loss": 0.508, "step": 1500 }, { "epoch": 0.10893839533743668, "grad_norm": 5.668378829956055, "learning_rate": 4.820070083701001e-05, "loss": 0.7482, "step": 2000 }, { "epoch": 0.13617299417179585, "grad_norm": 2.5953574180603027, "learning_rate": 4.774679085643735e-05, "loss": 0.6969, "step": 2500 }, { "epoch": 0.163407593006155, "grad_norm": 1.506287932395935, "learning_rate": 4.72928808758647e-05, "loss": 0.501, "step": 3000 }, { "epoch": 0.1906421918405142, "grad_norm": 1.627192735671997, "learning_rate": 4.683897089529205e-05, "loss": 0.5321, "step": 3500 }, { "epoch": 0.21787679067487337, "grad_norm": 1.1909410953521729, "learning_rate": 4.63850609147194e-05, "loss": 0.5486, "step": 4000 }, { "epoch": 0.24511138950923253, "grad_norm": 1.7642792463302612, "learning_rate": 4.5931150934146743e-05, "loss": 0.5043, "step": 4500 }, { "epoch": 0.2723459883435917, "grad_norm": 2.2761762142181396, "learning_rate": 4.547724095357409e-05, "loss": 0.4645, "step": 5000 }, { "epoch": 0.2995805871779509, "grad_norm": 2.3419032096862793, "learning_rate": 4.502333097300144e-05, "loss": 0.4197, "step": 5500 }, { "epoch": 0.32681518601231, "grad_norm": 3.043858051300049, "learning_rate": 4.4569420992428784e-05, "loss": 0.4128, "step": 6000 }, { "epoch": 0.3540497848466692, "grad_norm": 10.96549129486084, "learning_rate": 4.411551101185613e-05, "loss": 0.4257, "step": 6500 }, { "epoch": 0.3812843836810284, "grad_norm": 2.053966760635376, "learning_rate": 4.366160103128348e-05, "loss": 0.3825, "step": 7000 }, { "epoch": 0.40851898251538754, "grad_norm": 2.4897701740264893, "learning_rate": 4.3207691050710824e-05, "loss": 0.3699, "step": 7500 }, { "epoch": 0.43575358134974673, "grad_norm": 0.603682816028595, "learning_rate": 4.275378107013817e-05, "loss": 0.4291, "step": 8000 }, { "epoch": 0.46298818018410587, "grad_norm": 0.8895764350891113, "learning_rate": 4.229987108956552e-05, "loss": 0.3847, "step": 8500 }, { "epoch": 0.49022277901846506, "grad_norm": 0.43345028162002563, "learning_rate": 4.1845961108992865e-05, "loss": 0.3642, "step": 9000 }, { "epoch": 0.5174573778528242, "grad_norm": 1.6731306314468384, "learning_rate": 4.139205112842021e-05, "loss": 0.3751, "step": 9500 }, { "epoch": 0.5446919766871834, "grad_norm": 1.6484122276306152, "learning_rate": 4.093814114784756e-05, "loss": 0.4035, "step": 10000 }, { "epoch": 0.5719265755215426, "grad_norm": 1.7121918201446533, "learning_rate": 4.0484231167274905e-05, "loss": 0.3796, "step": 10500 }, { "epoch": 0.5991611743559018, "grad_norm": 2.6948976516723633, "learning_rate": 4.0030321186702256e-05, "loss": 0.372, "step": 11000 }, { "epoch": 0.626395773190261, "grad_norm": 3.9049389362335205, "learning_rate": 3.957641120612961e-05, "loss": 0.3455, "step": 11500 }, { "epoch": 0.65363037202462, "grad_norm": 0.8507063388824463, "learning_rate": 3.912250122555695e-05, "loss": 0.3432, "step": 12000 }, { "epoch": 0.6808649708589792, "grad_norm": 1.8186389207839966, "learning_rate": 3.8668591244984297e-05, "loss": 0.3413, "step": 12500 }, { "epoch": 0.7080995696933384, "grad_norm": 1.0689102411270142, "learning_rate": 3.821468126441165e-05, "loss": 0.3821, "step": 13000 }, { "epoch": 0.7353341685276976, "grad_norm": 1.7289353609085083, "learning_rate": 3.776077128383899e-05, "loss": 0.3861, "step": 13500 }, { "epoch": 0.7625687673620568, "grad_norm": 1.1911722421646118, "learning_rate": 3.730686130326634e-05, "loss": 0.3653, "step": 14000 }, { "epoch": 0.7898033661964159, "grad_norm": 6.017147541046143, "learning_rate": 3.685295132269369e-05, "loss": 0.3523, "step": 14500 }, { "epoch": 0.8170379650307751, "grad_norm": 0.9723203778266907, "learning_rate": 3.639904134212103e-05, "loss": 0.3366, "step": 15000 }, { "epoch": 0.8442725638651343, "grad_norm": 1.8334780931472778, "learning_rate": 3.594513136154838e-05, "loss": 0.3933, "step": 15500 }, { "epoch": 0.8715071626994935, "grad_norm": 1.174159049987793, "learning_rate": 3.549122138097573e-05, "loss": 0.3676, "step": 16000 }, { "epoch": 0.8987417615338527, "grad_norm": 0.1367327719926834, "learning_rate": 3.503731140040307e-05, "loss": 0.3286, "step": 16500 }, { "epoch": 0.9259763603682117, "grad_norm": 2.6567485332489014, "learning_rate": 3.458340141983042e-05, "loss": 0.3401, "step": 17000 }, { "epoch": 0.9532109592025709, "grad_norm": 0.11480577290058136, "learning_rate": 3.412949143925777e-05, "loss": 0.3858, "step": 17500 }, { "epoch": 0.9804455580369301, "grad_norm": 2.1067185401916504, "learning_rate": 3.3675581458685113e-05, "loss": 0.3208, "step": 18000 }, { "epoch": 1.0, "eval_runtime": 198.3127, "eval_samples_per_second": 9.48, "eval_steps_per_second": 9.48, "step": 18359 }, { "epoch": 1.0076801568712892, "grad_norm": 4.7463698387146, "learning_rate": 3.3221671478112465e-05, "loss": 0.3242, "step": 18500 }, { "epoch": 1.0349147557056484, "grad_norm": 0.8721242547035217, "learning_rate": 3.276776149753981e-05, "loss": 0.3311, "step": 19000 }, { "epoch": 1.0621493545400076, "grad_norm": 1.6243788003921509, "learning_rate": 3.231385151696716e-05, "loss": 0.2998, "step": 19500 }, { "epoch": 1.0893839533743668, "grad_norm": 0.8359081149101257, "learning_rate": 3.1859941536394505e-05, "loss": 0.3168, "step": 20000 }, { "epoch": 1.116618552208726, "grad_norm": 0.6658357381820679, "learning_rate": 3.140603155582185e-05, "loss": 0.331, "step": 20500 }, { "epoch": 1.1438531510430852, "grad_norm": 0.5795690417289734, "learning_rate": 3.09521215752492e-05, "loss": 0.3073, "step": 21000 }, { "epoch": 1.1710877498774444, "grad_norm": 0.18823903799057007, "learning_rate": 3.0498211594676546e-05, "loss": 0.2997, "step": 21500 }, { "epoch": 1.1983223487118035, "grad_norm": 0.7759385704994202, "learning_rate": 3.0044301614103893e-05, "loss": 0.3181, "step": 22000 }, { "epoch": 1.2255569475461627, "grad_norm": 2.6760952472686768, "learning_rate": 2.9590391633531238e-05, "loss": 0.3208, "step": 22500 }, { "epoch": 1.252791546380522, "grad_norm": 0.7384393215179443, "learning_rate": 2.9136481652958586e-05, "loss": 0.3236, "step": 23000 }, { "epoch": 1.280026145214881, "grad_norm": 0.1822945773601532, "learning_rate": 2.8682571672385934e-05, "loss": 0.2944, "step": 23500 }, { "epoch": 1.30726074404924, "grad_norm": 1.2044873237609863, "learning_rate": 2.822866169181328e-05, "loss": 0.3148, "step": 24000 }, { "epoch": 1.3344953428835993, "grad_norm": 0.12448325008153915, "learning_rate": 2.7774751711240626e-05, "loss": 0.32, "step": 24500 }, { "epoch": 1.3617299417179585, "grad_norm": 0.1313730776309967, "learning_rate": 2.7320841730667974e-05, "loss": 0.2869, "step": 25000 }, { "epoch": 1.3889645405523177, "grad_norm": 0.2766351103782654, "learning_rate": 2.686693175009532e-05, "loss": 0.3052, "step": 25500 }, { "epoch": 1.4161991393866769, "grad_norm": 1.1278197765350342, "learning_rate": 2.6413021769522673e-05, "loss": 0.2979, "step": 26000 }, { "epoch": 1.443433738221036, "grad_norm": 1.9573335647583008, "learning_rate": 2.5959111788950018e-05, "loss": 0.3226, "step": 26500 }, { "epoch": 1.4706683370553952, "grad_norm": 1.249816656112671, "learning_rate": 2.5505201808377366e-05, "loss": 0.3446, "step": 27000 }, { "epoch": 1.4979029358897544, "grad_norm": 1.5611047744750977, "learning_rate": 2.5051291827804714e-05, "loss": 0.3295, "step": 27500 }, { "epoch": 1.5251375347241134, "grad_norm": 0.2420412003993988, "learning_rate": 2.4597381847232058e-05, "loss": 0.3048, "step": 28000 }, { "epoch": 1.5523721335584728, "grad_norm": 0.621634304523468, "learning_rate": 2.4143471866659406e-05, "loss": 0.3135, "step": 28500 }, { "epoch": 1.5796067323928318, "grad_norm": 0.09876800328493118, "learning_rate": 2.3689561886086754e-05, "loss": 0.3231, "step": 29000 }, { "epoch": 1.606841331227191, "grad_norm": 0.10343176126480103, "learning_rate": 2.32356519055141e-05, "loss": 0.3441, "step": 29500 }, { "epoch": 1.6340759300615502, "grad_norm": 0.44668447971343994, "learning_rate": 2.2781741924941447e-05, "loss": 0.3297, "step": 30000 }, { "epoch": 1.6613105288959094, "grad_norm": 0.37340623140335083, "learning_rate": 2.2327831944368795e-05, "loss": 0.3026, "step": 30500 }, { "epoch": 1.6885451277302685, "grad_norm": 0.21011939644813538, "learning_rate": 2.187392196379614e-05, "loss": 0.3254, "step": 31000 }, { "epoch": 1.7157797265646277, "grad_norm": 1.6312121152877808, "learning_rate": 2.1420011983223487e-05, "loss": 0.3186, "step": 31500 }, { "epoch": 1.743014325398987, "grad_norm": 1.275604248046875, "learning_rate": 2.096700982261198e-05, "loss": 0.2971, "step": 32000 }, { "epoch": 1.770248924233346, "grad_norm": 1.6331900358200073, "learning_rate": 2.0513099842039328e-05, "loss": 0.3249, "step": 32500 }, { "epoch": 1.7974835230677053, "grad_norm": 1.0726169347763062, "learning_rate": 2.0059189861466676e-05, "loss": 0.3159, "step": 33000 }, { "epoch": 1.8247181219020643, "grad_norm": 2.8441109657287598, "learning_rate": 1.9606187700855166e-05, "loss": 0.3013, "step": 33500 }, { "epoch": 1.8519527207364237, "grad_norm": 1.9688265323638916, "learning_rate": 1.915318554024366e-05, "loss": 0.3048, "step": 34000 }, { "epoch": 1.8791873195707827, "grad_norm": 0.29343387484550476, "learning_rate": 1.8699275559671007e-05, "loss": 0.3037, "step": 34500 }, { "epoch": 1.9064219184051419, "grad_norm": 1.4208123683929443, "learning_rate": 1.8247181219020645e-05, "loss": 0.3486, "step": 35000 }, { "epoch": 1.933656517239501, "grad_norm": 0.09636660665273666, "learning_rate": 1.7793271238447993e-05, "loss": 0.3057, "step": 35500 }, { "epoch": 1.9608911160738602, "grad_norm": 2.4064226150512695, "learning_rate": 1.7339361257875337e-05, "loss": 0.2992, "step": 36000 }, { "epoch": 1.9881257149082194, "grad_norm": 0.09306484460830688, "learning_rate": 1.6885451277302685e-05, "loss": 0.2987, "step": 36500 }, { "epoch": 2.0, "eval_runtime": 197.752, "eval_samples_per_second": 9.507, "eval_steps_per_second": 9.507, "step": 36718 }, { "epoch": 2.0153603137425784, "grad_norm": 1.6785128116607666, "learning_rate": 1.6431541296730033e-05, "loss": 0.3208, "step": 37000 }, { "epoch": 2.042594912576938, "grad_norm": 3.7624003887176514, "learning_rate": 1.5978539136118526e-05, "loss": 0.2908, "step": 37500 }, { "epoch": 2.069829511411297, "grad_norm": 0.20085683465003967, "learning_rate": 1.552462915554587e-05, "loss": 0.2729, "step": 38000 }, { "epoch": 2.097064110245656, "grad_norm": 0.11236262321472168, "learning_rate": 1.5070719174973219e-05, "loss": 0.2664, "step": 38500 }, { "epoch": 2.124298709080015, "grad_norm": 2.0708374977111816, "learning_rate": 1.4616809194400567e-05, "loss": 0.2511, "step": 39000 }, { "epoch": 2.1515333079143746, "grad_norm": 1.7030911445617676, "learning_rate": 1.4162899213827916e-05, "loss": 0.2809, "step": 39500 }, { "epoch": 2.1787679067487336, "grad_norm": 0.11112015694379807, "learning_rate": 1.3708989233255262e-05, "loss": 0.2294, "step": 40000 }, { "epoch": 2.206002505583093, "grad_norm": 2.3932106494903564, "learning_rate": 1.3255079252682609e-05, "loss": 0.2881, "step": 40500 }, { "epoch": 2.233237104417452, "grad_norm": 0.9254179000854492, "learning_rate": 1.2801169272109957e-05, "loss": 0.2532, "step": 41000 }, { "epoch": 2.2604717032518113, "grad_norm": 0.17265941202640533, "learning_rate": 1.2347259291537303e-05, "loss": 0.2502, "step": 41500 }, { "epoch": 2.2877063020861703, "grad_norm": 2.3043088912963867, "learning_rate": 1.189334931096465e-05, "loss": 0.2799, "step": 42000 }, { "epoch": 2.3149409009205293, "grad_norm": 0.11663592606782913, "learning_rate": 1.1439439330391997e-05, "loss": 0.2569, "step": 42500 }, { "epoch": 2.3421754997548887, "grad_norm": 2.5327258110046387, "learning_rate": 1.0986437169780488e-05, "loss": 0.2735, "step": 43000 }, { "epoch": 2.3694100985892477, "grad_norm": 1.2668527364730835, "learning_rate": 1.0532527189207838e-05, "loss": 0.2692, "step": 43500 }, { "epoch": 2.396644697423607, "grad_norm": 1.1176379919052124, "learning_rate": 1.0078617208635184e-05, "loss": 0.2984, "step": 44000 }, { "epoch": 2.423879296257966, "grad_norm": 0.13128969073295593, "learning_rate": 9.62470722806253e-06, "loss": 0.2592, "step": 44500 }, { "epoch": 2.4511138950923255, "grad_norm": 0.8079116344451904, "learning_rate": 9.170797247489878e-06, "loss": 0.2869, "step": 45000 }, { "epoch": 2.4783484939266844, "grad_norm": 0.9324661493301392, "learning_rate": 8.716887266917226e-06, "loss": 0.2674, "step": 45500 }, { "epoch": 2.505583092761044, "grad_norm": 0.18096031248569489, "learning_rate": 8.262977286344572e-06, "loss": 0.2852, "step": 46000 }, { "epoch": 2.532817691595403, "grad_norm": 0.13841697573661804, "learning_rate": 7.811790765655356e-06, "loss": 0.2747, "step": 46500 }, { "epoch": 2.560052290429762, "grad_norm": 2.215595006942749, "learning_rate": 7.357880785082703e-06, "loss": 0.2836, "step": 47000 }, { "epoch": 2.587286889264121, "grad_norm": 0.17113931477069855, "learning_rate": 6.90397080451005e-06, "loss": 0.2791, "step": 47500 }, { "epoch": 2.61452148809848, "grad_norm": 1.888545274734497, "learning_rate": 6.450060823937397e-06, "loss": 0.2685, "step": 48000 }, { "epoch": 2.6417560869328396, "grad_norm": 0.15251892805099487, "learning_rate": 5.996150843364744e-06, "loss": 0.2958, "step": 48500 }, { "epoch": 2.6689906857671986, "grad_norm": 2.180168628692627, "learning_rate": 5.542240862792091e-06, "loss": 0.2887, "step": 49000 }, { "epoch": 2.696225284601558, "grad_norm": 1.863853931427002, "learning_rate": 5.088330882219438e-06, "loss": 0.2931, "step": 49500 }, { "epoch": 2.723459883435917, "grad_norm": 2.8054542541503906, "learning_rate": 4.634420901646785e-06, "loss": 0.2659, "step": 50000 }, { "epoch": 2.7506944822702764, "grad_norm": 3.5175323486328125, "learning_rate": 4.180510921074133e-06, "loss": 0.2574, "step": 50500 }, { "epoch": 2.7779290811046353, "grad_norm": 0.14439070224761963, "learning_rate": 3.727508760462625e-06, "loss": 0.2883, "step": 51000 }, { "epoch": 2.8051636799389943, "grad_norm": 0.41310277581214905, "learning_rate": 3.2735987798899726e-06, "loss": 0.2963, "step": 51500 }, { "epoch": 2.8323982787733537, "grad_norm": 0.3658026158809662, "learning_rate": 2.8196887993173193e-06, "loss": 0.2577, "step": 52000 }, { "epoch": 2.8596328776077127, "grad_norm": 0.11469651013612747, "learning_rate": 2.365778818744667e-06, "loss": 0.2784, "step": 52500 }, { "epoch": 2.886867476442072, "grad_norm": 0.16579371690750122, "learning_rate": 1.911868838172014e-06, "loss": 0.2777, "step": 53000 }, { "epoch": 2.914102075276431, "grad_norm": 3.678469657897949, "learning_rate": 1.457958857599361e-06, "loss": 0.2688, "step": 53500 }, { "epoch": 2.9413366741107905, "grad_norm": 0.30534350872039795, "learning_rate": 1.0040488770267082e-06, "loss": 0.2776, "step": 54000 }, { "epoch": 2.9685712729451494, "grad_norm": 0.42191004753112793, "learning_rate": 5.510467164152006e-07, "loss": 0.2729, "step": 54500 }, { "epoch": 2.995805871779509, "grad_norm": 1.7490407228469849, "learning_rate": 9.71367358425477e-08, "loss": 0.2829, "step": 55000 }, { "epoch": 3.0, "eval_runtime": 197.6032, "eval_samples_per_second": 9.514, "eval_steps_per_second": 9.514, "step": 55077 } ], "logging_steps": 500, "max_steps": 55077, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4605508716999475e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }