{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.1402364451691804,
"eval_steps": 500,
"global_step": 31500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 0.8877573609352112,
"learning_rate": 9.98641119717353e-06,
"loss": 2.7747,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 0.891560435295105,
"learning_rate": 9.97282239434706e-06,
"loss": 2.3805,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 1.140250563621521,
"learning_rate": 9.959233591520588e-06,
"loss": 2.2018,
"step": 300
},
{
"epoch": 0.03,
"grad_norm": 1.5957276821136475,
"learning_rate": 9.945644788694116e-06,
"loss": 2.1426,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 1.0516589879989624,
"learning_rate": 9.932055985867646e-06,
"loss": 2.0379,
"step": 500
},
{
"epoch": 0.03,
"eval_codebleu": 0.02236215080832533,
"eval_dataflow_match_score": 0.020328127538115238,
"eval_loss": 2.005807876586914,
"eval_ngram_match_score": 7.828601095920155e-05,
"eval_runtime": 211.5866,
"eval_samples_per_second": 12.019,
"eval_steps_per_second": 1.503,
"eval_syntax_match_score": 0.06800422172156935,
"eval_weighted_ngram_match_score": 0.0010379679626575443,
"step": 500
},
{
"epoch": 0.04,
"grad_norm": 1.4408249855041504,
"learning_rate": 9.918467183041175e-06,
"loss": 2.1005,
"step": 600
},
{
"epoch": 0.05,
"grad_norm": 1.042541742324829,
"learning_rate": 9.904878380214705e-06,
"loss": 2.0475,
"step": 700
},
{
"epoch": 0.05,
"grad_norm": 0.9206609129905701,
"learning_rate": 9.891289577388233e-06,
"loss": 2.0145,
"step": 800
},
{
"epoch": 0.06,
"grad_norm": 1.2876330614089966,
"learning_rate": 9.87770077456176e-06,
"loss": 1.9729,
"step": 900
},
{
"epoch": 0.07,
"grad_norm": 1.5496817827224731,
"learning_rate": 9.86411197173529e-06,
"loss": 1.9755,
"step": 1000
},
{
"epoch": 0.07,
"eval_codebleu": 0.026625421991617458,
"eval_dataflow_match_score": 0.017334601907502377,
"eval_loss": 1.8824141025543213,
"eval_ngram_match_score": 0.0001360979918249884,
"eval_runtime": 210.1093,
"eval_samples_per_second": 12.103,
"eval_steps_per_second": 1.513,
"eval_syntax_match_score": 0.085743571008139,
"eval_weighted_ngram_match_score": 0.0032874170590034683,
"step": 1000
},
{
"epoch": 0.07,
"grad_norm": 1.479344129562378,
"learning_rate": 9.85052316890882e-06,
"loss": 1.9526,
"step": 1100
},
{
"epoch": 0.08,
"grad_norm": 1.1165575981140137,
"learning_rate": 9.83693436608235e-06,
"loss": 1.9212,
"step": 1200
},
{
"epoch": 0.09,
"grad_norm": 1.871772289276123,
"learning_rate": 9.823345563255877e-06,
"loss": 1.8674,
"step": 1300
},
{
"epoch": 0.1,
"grad_norm": 1.1310107707977295,
"learning_rate": 9.809756760429407e-06,
"loss": 1.9083,
"step": 1400
},
{
"epoch": 0.1,
"grad_norm": 1.2779932022094727,
"learning_rate": 9.796167957602935e-06,
"loss": 1.8672,
"step": 1500
},
{
"epoch": 0.1,
"eval_codebleu": 0.03411263800674984,
"eval_dataflow_match_score": 0.035922307567354324,
"eval_loss": 1.8209562301635742,
"eval_ngram_match_score": 0.00019790870128407514,
"eval_runtime": 209.7736,
"eval_samples_per_second": 12.123,
"eval_steps_per_second": 1.516,
"eval_syntax_match_score": 0.09630802330065559,
"eval_weighted_ngram_match_score": 0.004022312457705368,
"step": 1500
},
{
"epoch": 0.11,
"grad_norm": 1.2445776462554932,
"learning_rate": 9.782579154776465e-06,
"loss": 1.8163,
"step": 1600
},
{
"epoch": 0.12,
"grad_norm": 1.673412561416626,
"learning_rate": 9.768990351949994e-06,
"loss": 1.8664,
"step": 1700
},
{
"epoch": 0.12,
"grad_norm": 1.3831863403320312,
"learning_rate": 9.755401549123524e-06,
"loss": 1.8336,
"step": 1800
},
{
"epoch": 0.13,
"grad_norm": 1.5055395364761353,
"learning_rate": 9.741812746297052e-06,
"loss": 1.879,
"step": 1900
},
{
"epoch": 0.14,
"grad_norm": 1.3736721277236938,
"learning_rate": 9.72822394347058e-06,
"loss": 1.8522,
"step": 2000
},
{
"epoch": 0.14,
"eval_codebleu": 0.04137599574546961,
"eval_dataflow_match_score": 0.06186619636599912,
"eval_loss": 1.7707544565200806,
"eval_ngram_match_score": 0.00026554637185762897,
"eval_runtime": 210.486,
"eval_samples_per_second": 12.082,
"eval_steps_per_second": 1.511,
"eval_syntax_match_score": 0.09913941829547991,
"eval_weighted_ngram_match_score": 0.00423282194854177,
"step": 2000
},
{
"epoch": 0.14,
"grad_norm": 1.5695489645004272,
"learning_rate": 9.71463514064411e-06,
"loss": 1.7906,
"step": 2100
},
{
"epoch": 0.15,
"grad_norm": 1.4151173830032349,
"learning_rate": 9.701046337817639e-06,
"loss": 1.8182,
"step": 2200
},
{
"epoch": 0.16,
"grad_norm": 1.7115099430084229,
"learning_rate": 9.687457534991169e-06,
"loss": 1.8232,
"step": 2300
},
{
"epoch": 0.16,
"grad_norm": 1.1658021211624146,
"learning_rate": 9.673868732164697e-06,
"loss": 1.7381,
"step": 2400
},
{
"epoch": 0.17,
"grad_norm": 1.5483379364013672,
"learning_rate": 9.660279929338226e-06,
"loss": 1.7707,
"step": 2500
},
{
"epoch": 0.17,
"eval_codebleu": 0.041373696094825684,
"eval_dataflow_match_score": 0.06151811199034646,
"eval_loss": 1.7324126958847046,
"eval_ngram_match_score": 0.00031767245966084565,
"eval_runtime": 210.1473,
"eval_samples_per_second": 12.101,
"eval_steps_per_second": 1.513,
"eval_syntax_match_score": 0.09862185146846902,
"eval_weighted_ngram_match_score": 0.0050371484608264,
"step": 2500
},
{
"epoch": 0.18,
"grad_norm": 1.138310194015503,
"learning_rate": 9.646691126511754e-06,
"loss": 1.702,
"step": 2600
},
{
"epoch": 0.18,
"grad_norm": 1.5409811735153198,
"learning_rate": 9.633102323685284e-06,
"loss": 1.7403,
"step": 2700
},
{
"epoch": 0.19,
"grad_norm": 1.2863227128982544,
"learning_rate": 9.619513520858813e-06,
"loss": 1.7514,
"step": 2800
},
{
"epoch": 0.2,
"grad_norm": 1.523219347000122,
"learning_rate": 9.605924718032343e-06,
"loss": 1.7735,
"step": 2900
},
{
"epoch": 0.2,
"grad_norm": 1.6315879821777344,
"learning_rate": 9.592335915205871e-06,
"loss": 1.7289,
"step": 3000
},
{
"epoch": 0.2,
"eval_codebleu": 0.03736357788677419,
"eval_dataflow_match_score": 0.04515814633467152,
"eval_loss": 1.6959866285324097,
"eval_ngram_match_score": 0.00026766267346779954,
"eval_runtime": 209.9032,
"eval_samples_per_second": 12.115,
"eval_steps_per_second": 1.515,
"eval_syntax_match_score": 0.09879437374413931,
"eval_weighted_ngram_match_score": 0.0052341287948181195,
"step": 3000
},
{
"epoch": 0.21,
"grad_norm": 1.5990880727767944,
"learning_rate": 9.5787471123794e-06,
"loss": 1.7275,
"step": 3100
},
{
"epoch": 0.22,
"grad_norm": 1.787031650543213,
"learning_rate": 9.565158309552928e-06,
"loss": 1.7112,
"step": 3200
},
{
"epoch": 0.22,
"grad_norm": 1.6794238090515137,
"learning_rate": 9.551569506726458e-06,
"loss": 1.683,
"step": 3300
},
{
"epoch": 0.23,
"grad_norm": 2.1808717250823975,
"learning_rate": 9.537980703899988e-06,
"loss": 1.8087,
"step": 3400
},
{
"epoch": 0.24,
"grad_norm": 1.737121343612671,
"learning_rate": 9.524391901073516e-06,
"loss": 1.7739,
"step": 3500
},
{
"epoch": 0.24,
"eval_codebleu": 0.03697189900391692,
"eval_dataflow_match_score": 0.03780196319587868,
"eval_loss": 1.668063998222351,
"eval_ngram_match_score": 0.0003274646611745997,
"eval_runtime": 208.872,
"eval_samples_per_second": 12.175,
"eval_steps_per_second": 1.522,
"eval_syntax_match_score": 0.1035438105096511,
"eval_weighted_ngram_match_score": 0.006214357648963285,
"step": 3500
},
{
"epoch": 0.24,
"grad_norm": 1.6818939447402954,
"learning_rate": 9.510803098247045e-06,
"loss": 1.7366,
"step": 3600
},
{
"epoch": 0.25,
"grad_norm": 1.4390071630477905,
"learning_rate": 9.497214295420573e-06,
"loss": 1.6693,
"step": 3700
},
{
"epoch": 0.26,
"grad_norm": 1.443108081817627,
"learning_rate": 9.483625492594103e-06,
"loss": 1.6856,
"step": 3800
},
{
"epoch": 0.26,
"grad_norm": 1.6540530920028687,
"learning_rate": 9.470036689767632e-06,
"loss": 1.6965,
"step": 3900
},
{
"epoch": 0.27,
"grad_norm": 1.178989291191101,
"learning_rate": 9.456447886941162e-06,
"loss": 1.6797,
"step": 4000
},
{
"epoch": 0.27,
"eval_codebleu": 0.033575238882649495,
"eval_dataflow_match_score": 0.025967094423688303,
"eval_loss": 1.6396487951278687,
"eval_ngram_match_score": 0.00023558729997467283,
"eval_runtime": 209.0765,
"eval_samples_per_second": 12.163,
"eval_steps_per_second": 1.521,
"eval_syntax_match_score": 0.1018591812295764,
"eval_weighted_ngram_match_score": 0.006239092577358598,
"step": 4000
},
{
"epoch": 0.28,
"grad_norm": 1.888967752456665,
"learning_rate": 9.44285908411469e-06,
"loss": 1.6815,
"step": 4100
},
{
"epoch": 0.29,
"grad_norm": 1.4612641334533691,
"learning_rate": 9.42927028128822e-06,
"loss": 1.6089,
"step": 4200
},
{
"epoch": 0.29,
"grad_norm": 1.5609626770019531,
"learning_rate": 9.415681478461748e-06,
"loss": 1.6824,
"step": 4300
},
{
"epoch": 0.3,
"grad_norm": 1.38056218624115,
"learning_rate": 9.402092675635277e-06,
"loss": 1.679,
"step": 4400
},
{
"epoch": 0.31,
"grad_norm": 1.928477168083191,
"learning_rate": 9.388503872808807e-06,
"loss": 1.6737,
"step": 4500
},
{
"epoch": 0.31,
"eval_codebleu": 0.03436485059379589,
"eval_dataflow_match_score": 0.025433365047687558,
"eval_loss": 1.619603157043457,
"eval_ngram_match_score": 0.00025042578946432695,
"eval_runtime": 210.4293,
"eval_samples_per_second": 12.085,
"eval_steps_per_second": 1.511,
"eval_syntax_match_score": 0.10487324686922811,
"eval_weighted_ngram_match_score": 0.006902364668803559,
"step": 4500
},
{
"epoch": 0.31,
"grad_norm": 1.467225432395935,
"learning_rate": 9.374915069982335e-06,
"loss": 1.642,
"step": 4600
},
{
"epoch": 0.32,
"grad_norm": 1.4231284856796265,
"learning_rate": 9.361326267155864e-06,
"loss": 1.6238,
"step": 4700
},
{
"epoch": 0.33,
"grad_norm": 1.61458158493042,
"learning_rate": 9.347737464329394e-06,
"loss": 1.6226,
"step": 4800
},
{
"epoch": 0.33,
"grad_norm": 1.8465476036071777,
"learning_rate": 9.334148661502922e-06,
"loss": 1.6427,
"step": 4900
},
{
"epoch": 0.34,
"grad_norm": 2.1492788791656494,
"learning_rate": 9.320559858676452e-06,
"loss": 1.6783,
"step": 5000
},
{
"epoch": 0.34,
"eval_codebleu": 0.03197730480981573,
"eval_dataflow_match_score": 0.018842967535330563,
"eval_loss": 1.596398115158081,
"eval_ngram_match_score": 0.00021050334214284166,
"eval_runtime": 209.9696,
"eval_samples_per_second": 12.111,
"eval_steps_per_second": 1.515,
"eval_syntax_match_score": 0.10182873612210518,
"eval_weighted_ngram_match_score": 0.0070270122396843505,
"step": 5000
},
{
"epoch": 0.35,
"grad_norm": 1.5661336183547974,
"learning_rate": 9.306971055849981e-06,
"loss": 1.6227,
"step": 5100
},
{
"epoch": 0.35,
"grad_norm": 1.5441151857376099,
"learning_rate": 9.293382253023509e-06,
"loss": 1.6525,
"step": 5200
},
{
"epoch": 0.36,
"grad_norm": 1.507628083229065,
"learning_rate": 9.279793450197039e-06,
"loss": 1.649,
"step": 5300
},
{
"epoch": 0.37,
"grad_norm": 1.5360280275344849,
"learning_rate": 9.266204647370567e-06,
"loss": 1.6738,
"step": 5400
},
{
"epoch": 0.37,
"grad_norm": 1.6375775337219238,
"learning_rate": 9.252615844544096e-06,
"loss": 1.5854,
"step": 5500
},
{
"epoch": 0.37,
"eval_loss": 1.5711854696273804,
"eval_runtime": 70.8487,
"eval_samples_per_second": 35.893,
"eval_steps_per_second": 4.488,
"step": 5500
},
{
"epoch": 0.38,
"grad_norm": 1.5210243463516235,
"learning_rate": 9.239027041717626e-06,
"loss": 1.621,
"step": 5600
},
{
"epoch": 0.39,
"grad_norm": 1.2972959280014038,
"learning_rate": 9.225438238891154e-06,
"loss": 1.6124,
"step": 5700
},
{
"epoch": 0.39,
"grad_norm": 1.902424693107605,
"learning_rate": 9.211849436064683e-06,
"loss": 1.6012,
"step": 5800
},
{
"epoch": 0.4,
"grad_norm": 1.865466594696045,
"learning_rate": 9.198260633238213e-06,
"loss": 1.5745,
"step": 5900
},
{
"epoch": 0.41,
"grad_norm": 1.6546335220336914,
"learning_rate": 9.184671830411741e-06,
"loss": 1.5874,
"step": 6000
},
{
"epoch": 0.41,
"eval_loss": 1.553696870803833,
"eval_runtime": 70.799,
"eval_samples_per_second": 35.919,
"eval_steps_per_second": 4.492,
"step": 6000
},
{
"epoch": 0.41,
"grad_norm": 1.7478700876235962,
"learning_rate": 9.17108302758527e-06,
"loss": 1.5666,
"step": 6100
},
{
"epoch": 0.42,
"grad_norm": 1.5560758113861084,
"learning_rate": 9.1574942247588e-06,
"loss": 1.5382,
"step": 6200
},
{
"epoch": 0.43,
"grad_norm": 1.4764758348464966,
"learning_rate": 9.143905421932328e-06,
"loss": 1.5408,
"step": 6300
},
{
"epoch": 0.43,
"grad_norm": 1.6634376049041748,
"learning_rate": 9.130316619105858e-06,
"loss": 1.5688,
"step": 6400
},
{
"epoch": 0.44,
"grad_norm": 1.673550009727478,
"learning_rate": 9.116727816279387e-06,
"loss": 1.5445,
"step": 6500
},
{
"epoch": 0.44,
"eval_loss": 1.5321872234344482,
"eval_runtime": 70.8079,
"eval_samples_per_second": 35.914,
"eval_steps_per_second": 4.491,
"step": 6500
},
{
"epoch": 0.45,
"grad_norm": 1.572766900062561,
"learning_rate": 9.103139013452915e-06,
"loss": 1.5596,
"step": 6600
},
{
"epoch": 0.46,
"grad_norm": 1.4650744199752808,
"learning_rate": 9.089550210626445e-06,
"loss": 1.5327,
"step": 6700
},
{
"epoch": 0.46,
"grad_norm": 1.8133198022842407,
"learning_rate": 9.075961407799973e-06,
"loss": 1.5789,
"step": 6800
},
{
"epoch": 0.47,
"grad_norm": 1.6164945363998413,
"learning_rate": 9.062372604973503e-06,
"loss": 1.4918,
"step": 6900
},
{
"epoch": 0.48,
"grad_norm": 1.7517341375350952,
"learning_rate": 9.048783802147032e-06,
"loss": 1.4947,
"step": 7000
},
{
"epoch": 0.48,
"eval_loss": 1.5142408609390259,
"eval_runtime": 70.7829,
"eval_samples_per_second": 35.927,
"eval_steps_per_second": 4.493,
"step": 7000
},
{
"epoch": 0.48,
"grad_norm": 2.3255422115325928,
"learning_rate": 9.03519499932056e-06,
"loss": 1.5434,
"step": 7100
},
{
"epoch": 0.49,
"grad_norm": 2.118253469467163,
"learning_rate": 9.02160619649409e-06,
"loss": 1.5191,
"step": 7200
},
{
"epoch": 0.5,
"grad_norm": 1.5948559045791626,
"learning_rate": 9.008017393667618e-06,
"loss": 1.5437,
"step": 7300
},
{
"epoch": 0.5,
"grad_norm": 1.4735013246536255,
"learning_rate": 8.994428590841147e-06,
"loss": 1.5217,
"step": 7400
},
{
"epoch": 0.51,
"grad_norm": 1.5940884351730347,
"learning_rate": 8.980839788014677e-06,
"loss": 1.5113,
"step": 7500
},
{
"epoch": 0.51,
"eval_loss": 1.4991660118103027,
"eval_runtime": 70.7857,
"eval_samples_per_second": 35.925,
"eval_steps_per_second": 4.492,
"step": 7500
},
{
"epoch": 0.52,
"grad_norm": 1.7738714218139648,
"learning_rate": 8.967250985188207e-06,
"loss": 1.5253,
"step": 7600
},
{
"epoch": 0.52,
"grad_norm": 1.8445935249328613,
"learning_rate": 8.953662182361734e-06,
"loss": 1.5359,
"step": 7700
},
{
"epoch": 0.53,
"grad_norm": 1.5108485221862793,
"learning_rate": 8.940073379535264e-06,
"loss": 1.5228,
"step": 7800
},
{
"epoch": 0.54,
"grad_norm": 1.7147557735443115,
"learning_rate": 8.926484576708792e-06,
"loss": 1.5228,
"step": 7900
},
{
"epoch": 0.54,
"grad_norm": 1.5899161100387573,
"learning_rate": 8.912895773882322e-06,
"loss": 1.4889,
"step": 8000
},
{
"epoch": 0.54,
"eval_loss": 1.4857795238494873,
"eval_runtime": 70.8945,
"eval_samples_per_second": 35.87,
"eval_steps_per_second": 4.486,
"step": 8000
},
{
"epoch": 0.55,
"grad_norm": 1.6886754035949707,
"learning_rate": 8.899306971055851e-06,
"loss": 1.4898,
"step": 8100
},
{
"epoch": 0.56,
"grad_norm": 1.5048600435256958,
"learning_rate": 8.885718168229381e-06,
"loss": 1.5239,
"step": 8200
},
{
"epoch": 0.56,
"grad_norm": 1.8773216009140015,
"learning_rate": 8.872129365402909e-06,
"loss": 1.4889,
"step": 8300
},
{
"epoch": 0.57,
"grad_norm": 1.7063783407211304,
"learning_rate": 8.858540562576437e-06,
"loss": 1.4472,
"step": 8400
},
{
"epoch": 0.58,
"grad_norm": 1.5942984819412231,
"learning_rate": 8.844951759749966e-06,
"loss": 1.4998,
"step": 8500
},
{
"epoch": 0.58,
"eval_loss": 1.4711333513259888,
"eval_runtime": 70.9176,
"eval_samples_per_second": 35.858,
"eval_steps_per_second": 4.484,
"step": 8500
},
{
"epoch": 0.58,
"grad_norm": 1.818272352218628,
"learning_rate": 8.831362956923496e-06,
"loss": 1.5504,
"step": 8600
},
{
"epoch": 0.59,
"grad_norm": 1.8230923414230347,
"learning_rate": 8.817774154097026e-06,
"loss": 1.5182,
"step": 8700
},
{
"epoch": 0.6,
"grad_norm": 1.8054814338684082,
"learning_rate": 8.804185351270554e-06,
"loss": 1.5136,
"step": 8800
},
{
"epoch": 0.6,
"grad_norm": 1.7768468856811523,
"learning_rate": 8.790596548444083e-06,
"loss": 1.4643,
"step": 8900
},
{
"epoch": 0.61,
"grad_norm": 1.9298087358474731,
"learning_rate": 8.777007745617611e-06,
"loss": 1.4449,
"step": 9000
},
{
"epoch": 0.61,
"eval_loss": 1.4553042650222778,
"eval_runtime": 71.0741,
"eval_samples_per_second": 35.78,
"eval_steps_per_second": 4.474,
"step": 9000
},
{
"epoch": 0.62,
"grad_norm": 1.979022741317749,
"learning_rate": 8.76341894279114e-06,
"loss": 1.5057,
"step": 9100
},
{
"epoch": 0.63,
"grad_norm": 1.6144100427627563,
"learning_rate": 8.74983013996467e-06,
"loss": 1.4893,
"step": 9200
},
{
"epoch": 0.63,
"grad_norm": 1.7385257482528687,
"learning_rate": 8.7362413371382e-06,
"loss": 1.4427,
"step": 9300
},
{
"epoch": 0.64,
"grad_norm": 2.218280792236328,
"learning_rate": 8.722652534311728e-06,
"loss": 1.445,
"step": 9400
},
{
"epoch": 0.65,
"grad_norm": 1.760903000831604,
"learning_rate": 8.709063731485256e-06,
"loss": 1.4364,
"step": 9500
},
{
"epoch": 0.65,
"eval_loss": 1.4402815103530884,
"eval_runtime": 70.7715,
"eval_samples_per_second": 35.933,
"eval_steps_per_second": 4.493,
"step": 9500
},
{
"epoch": 0.65,
"grad_norm": 1.4513877630233765,
"learning_rate": 8.695474928658785e-06,
"loss": 1.437,
"step": 9600
},
{
"epoch": 0.66,
"grad_norm": 1.703837513923645,
"learning_rate": 8.681886125832315e-06,
"loss": 1.4565,
"step": 9700
},
{
"epoch": 0.67,
"grad_norm": 2.192049980163574,
"learning_rate": 8.668297323005845e-06,
"loss": 1.4654,
"step": 9800
},
{
"epoch": 0.67,
"grad_norm": 2.012014150619507,
"learning_rate": 8.654708520179373e-06,
"loss": 1.4975,
"step": 9900
},
{
"epoch": 0.68,
"grad_norm": 2.117527484893799,
"learning_rate": 8.641119717352902e-06,
"loss": 1.4446,
"step": 10000
},
{
"epoch": 0.68,
"eval_loss": 1.4303960800170898,
"eval_runtime": 70.9752,
"eval_samples_per_second": 35.829,
"eval_steps_per_second": 4.48,
"step": 10000
},
{
"epoch": 0.69,
"grad_norm": 2.3108479976654053,
"learning_rate": 8.62753091452643e-06,
"loss": 1.4801,
"step": 10100
},
{
"epoch": 0.69,
"grad_norm": 1.4589275121688843,
"learning_rate": 8.61394211169996e-06,
"loss": 1.4579,
"step": 10200
},
{
"epoch": 0.7,
"grad_norm": 1.7688006162643433,
"learning_rate": 8.60035330887349e-06,
"loss": 1.471,
"step": 10300
},
{
"epoch": 0.71,
"grad_norm": 1.6766855716705322,
"learning_rate": 8.586764506047019e-06,
"loss": 1.4532,
"step": 10400
},
{
"epoch": 0.71,
"grad_norm": 2.0386102199554443,
"learning_rate": 8.573175703220547e-06,
"loss": 1.3998,
"step": 10500
},
{
"epoch": 0.71,
"eval_loss": 1.4204617738723755,
"eval_runtime": 70.7476,
"eval_samples_per_second": 35.945,
"eval_steps_per_second": 4.495,
"step": 10500
},
{
"epoch": 0.72,
"grad_norm": 1.9797570705413818,
"learning_rate": 8.559586900394075e-06,
"loss": 1.3922,
"step": 10600
},
{
"epoch": 0.73,
"grad_norm": 1.7562373876571655,
"learning_rate": 8.545998097567605e-06,
"loss": 1.4378,
"step": 10700
},
{
"epoch": 0.73,
"grad_norm": 1.6127831935882568,
"learning_rate": 8.532409294741134e-06,
"loss": 1.4483,
"step": 10800
},
{
"epoch": 0.74,
"grad_norm": 1.6120541095733643,
"learning_rate": 8.518820491914664e-06,
"loss": 1.3961,
"step": 10900
},
{
"epoch": 0.75,
"grad_norm": 1.5521306991577148,
"learning_rate": 8.505231689088192e-06,
"loss": 1.4101,
"step": 11000
},
{
"epoch": 0.75,
"eval_loss": 1.4052175283432007,
"eval_runtime": 70.9316,
"eval_samples_per_second": 35.851,
"eval_steps_per_second": 4.483,
"step": 11000
},
{
"epoch": 0.75,
"grad_norm": 2.4100379943847656,
"learning_rate": 8.491642886261721e-06,
"loss": 1.4224,
"step": 11100
},
{
"epoch": 0.76,
"grad_norm": 1.7542225122451782,
"learning_rate": 8.47805408343525e-06,
"loss": 1.4238,
"step": 11200
},
{
"epoch": 0.77,
"grad_norm": 2.3809213638305664,
"learning_rate": 8.464465280608779e-06,
"loss": 1.3968,
"step": 11300
},
{
"epoch": 0.77,
"grad_norm": 1.490343451499939,
"learning_rate": 8.450876477782309e-06,
"loss": 1.4512,
"step": 11400
},
{
"epoch": 0.78,
"grad_norm": 1.663609504699707,
"learning_rate": 8.437287674955838e-06,
"loss": 1.4772,
"step": 11500
},
{
"epoch": 0.78,
"eval_loss": 1.3936774730682373,
"eval_runtime": 70.914,
"eval_samples_per_second": 35.86,
"eval_steps_per_second": 4.484,
"step": 11500
},
{
"epoch": 0.79,
"grad_norm": 1.5684208869934082,
"learning_rate": 8.423698872129366e-06,
"loss": 1.4276,
"step": 11600
},
{
"epoch": 0.79,
"grad_norm": 1.6131608486175537,
"learning_rate": 8.410110069302894e-06,
"loss": 1.4067,
"step": 11700
},
{
"epoch": 0.8,
"grad_norm": 2.017564058303833,
"learning_rate": 8.396521266476424e-06,
"loss": 1.4028,
"step": 11800
},
{
"epoch": 0.81,
"grad_norm": 2.383514165878296,
"learning_rate": 8.382932463649953e-06,
"loss": 1.4028,
"step": 11900
},
{
"epoch": 0.82,
"grad_norm": 2.202026605606079,
"learning_rate": 8.369343660823483e-06,
"loss": 1.3671,
"step": 12000
},
{
"epoch": 0.82,
"eval_loss": 1.3839792013168335,
"eval_runtime": 70.7236,
"eval_samples_per_second": 35.957,
"eval_steps_per_second": 4.496,
"step": 12000
},
{
"epoch": 0.82,
"grad_norm": 1.499125361442566,
"learning_rate": 8.355754857997011e-06,
"loss": 1.4193,
"step": 12100
},
{
"epoch": 0.83,
"grad_norm": 1.3109521865844727,
"learning_rate": 8.34216605517054e-06,
"loss": 1.3969,
"step": 12200
},
{
"epoch": 0.84,
"grad_norm": 2.689412832260132,
"learning_rate": 8.328577252344068e-06,
"loss": 1.4141,
"step": 12300
},
{
"epoch": 0.84,
"grad_norm": 1.6615593433380127,
"learning_rate": 8.314988449517598e-06,
"loss": 1.3512,
"step": 12400
},
{
"epoch": 0.85,
"grad_norm": 1.994040846824646,
"learning_rate": 8.301399646691128e-06,
"loss": 1.4268,
"step": 12500
},
{
"epoch": 0.85,
"eval_loss": 1.3757482767105103,
"eval_runtime": 70.9944,
"eval_samples_per_second": 35.82,
"eval_steps_per_second": 4.479,
"step": 12500
},
{
"epoch": 0.86,
"grad_norm": 2.2422096729278564,
"learning_rate": 8.287810843864657e-06,
"loss": 1.3549,
"step": 12600
},
{
"epoch": 0.86,
"grad_norm": 1.4407843351364136,
"learning_rate": 8.274222041038185e-06,
"loss": 1.3934,
"step": 12700
},
{
"epoch": 0.87,
"grad_norm": 3.7289652824401855,
"learning_rate": 8.260633238211713e-06,
"loss": 1.3916,
"step": 12800
},
{
"epoch": 0.88,
"grad_norm": 1.819023847579956,
"learning_rate": 8.247044435385243e-06,
"loss": 1.3878,
"step": 12900
},
{
"epoch": 0.88,
"grad_norm": 1.6075499057769775,
"learning_rate": 8.233455632558772e-06,
"loss": 1.3469,
"step": 13000
},
{
"epoch": 0.88,
"eval_loss": 1.365922212600708,
"eval_runtime": 71.0057,
"eval_samples_per_second": 35.814,
"eval_steps_per_second": 4.479,
"step": 13000
},
{
"epoch": 0.89,
"grad_norm": 1.9204126596450806,
"learning_rate": 8.219866829732302e-06,
"loss": 1.3456,
"step": 13100
},
{
"epoch": 0.9,
"grad_norm": 2.0110292434692383,
"learning_rate": 8.20627802690583e-06,
"loss": 1.3921,
"step": 13200
},
{
"epoch": 0.9,
"grad_norm": 1.4502629041671753,
"learning_rate": 8.192689224079358e-06,
"loss": 1.383,
"step": 13300
},
{
"epoch": 0.91,
"grad_norm": 2.5011653900146484,
"learning_rate": 8.179100421252888e-06,
"loss": 1.3413,
"step": 13400
},
{
"epoch": 0.92,
"grad_norm": 1.4338220357894897,
"learning_rate": 8.165511618426417e-06,
"loss": 1.3531,
"step": 13500
},
{
"epoch": 0.92,
"eval_loss": 1.3567384481430054,
"eval_runtime": 70.8084,
"eval_samples_per_second": 35.914,
"eval_steps_per_second": 4.491,
"step": 13500
},
{
"epoch": 0.92,
"grad_norm": 1.8867733478546143,
"learning_rate": 8.151922815599947e-06,
"loss": 1.4115,
"step": 13600
},
{
"epoch": 0.93,
"grad_norm": 1.897558331489563,
"learning_rate": 8.138334012773476e-06,
"loss": 1.3473,
"step": 13700
},
{
"epoch": 0.94,
"grad_norm": 2.6677191257476807,
"learning_rate": 8.124745209947004e-06,
"loss": 1.3982,
"step": 13800
},
{
"epoch": 0.94,
"grad_norm": 1.6690632104873657,
"learning_rate": 8.111156407120532e-06,
"loss": 1.3371,
"step": 13900
},
{
"epoch": 0.95,
"grad_norm": 1.668286919593811,
"learning_rate": 8.097567604294062e-06,
"loss": 1.3463,
"step": 14000
},
{
"epoch": 0.95,
"eval_loss": 1.3470206260681152,
"eval_runtime": 70.8694,
"eval_samples_per_second": 35.883,
"eval_steps_per_second": 4.487,
"step": 14000
},
{
"epoch": 0.96,
"grad_norm": 1.3303313255310059,
"learning_rate": 8.083978801467592e-06,
"loss": 1.3283,
"step": 14100
},
{
"epoch": 0.96,
"grad_norm": 1.8314011096954346,
"learning_rate": 8.070389998641121e-06,
"loss": 1.3382,
"step": 14200
},
{
"epoch": 0.97,
"grad_norm": 1.6911287307739258,
"learning_rate": 8.056801195814649e-06,
"loss": 1.3249,
"step": 14300
},
{
"epoch": 0.98,
"grad_norm": 2.0255990028381348,
"learning_rate": 8.043212392988179e-06,
"loss": 1.4345,
"step": 14400
},
{
"epoch": 0.99,
"grad_norm": 1.6872771978378296,
"learning_rate": 8.029623590161707e-06,
"loss": 1.3662,
"step": 14500
},
{
"epoch": 0.99,
"eval_loss": 1.3394687175750732,
"eval_runtime": 70.8165,
"eval_samples_per_second": 35.91,
"eval_steps_per_second": 4.49,
"step": 14500
},
{
"epoch": 0.99,
"grad_norm": 1.2456538677215576,
"learning_rate": 8.016034787335236e-06,
"loss": 1.3152,
"step": 14600
},
{
"epoch": 1.0,
"grad_norm": 1.9343585968017578,
"learning_rate": 8.002445984508766e-06,
"loss": 1.3179,
"step": 14700
},
{
"epoch": 1.01,
"grad_norm": 1.6026442050933838,
"learning_rate": 7.988857181682294e-06,
"loss": 1.3445,
"step": 14800
},
{
"epoch": 1.01,
"grad_norm": 1.8159044981002808,
"learning_rate": 7.975268378855823e-06,
"loss": 1.3259,
"step": 14900
},
{
"epoch": 1.02,
"grad_norm": 1.6430504322052002,
"learning_rate": 7.961679576029351e-06,
"loss": 1.337,
"step": 15000
},
{
"epoch": 1.02,
"eval_loss": 1.3323568105697632,
"eval_runtime": 70.9018,
"eval_samples_per_second": 35.867,
"eval_steps_per_second": 4.485,
"step": 15000
},
{
"epoch": 1.03,
"grad_norm": 2.036970853805542,
"learning_rate": 7.948090773202881e-06,
"loss": 1.3217,
"step": 15100
},
{
"epoch": 1.03,
"grad_norm": 1.6756584644317627,
"learning_rate": 7.93450197037641e-06,
"loss": 1.3008,
"step": 15200
},
{
"epoch": 1.04,
"grad_norm": 1.5923326015472412,
"learning_rate": 7.92091316754994e-06,
"loss": 1.3347,
"step": 15300
},
{
"epoch": 1.05,
"grad_norm": 1.809383749961853,
"learning_rate": 7.907460252751734e-06,
"loss": 1.3192,
"step": 15400
},
{
"epoch": 1.05,
"grad_norm": 2.035680055618286,
"learning_rate": 7.893871449925262e-06,
"loss": 1.3627,
"step": 15500
},
{
"epoch": 1.05,
"eval_loss": 1.3219527006149292,
"eval_runtime": 70.9632,
"eval_samples_per_second": 35.835,
"eval_steps_per_second": 4.481,
"step": 15500
},
{
"epoch": 1.06,
"grad_norm": 1.7485100030899048,
"learning_rate": 7.880282647098791e-06,
"loss": 1.3015,
"step": 15600
},
{
"epoch": 1.07,
"grad_norm": 2.0771241188049316,
"learning_rate": 7.86669384427232e-06,
"loss": 1.3261,
"step": 15700
},
{
"epoch": 1.07,
"grad_norm": 1.8625783920288086,
"learning_rate": 7.853105041445849e-06,
"loss": 1.3308,
"step": 15800
},
{
"epoch": 1.08,
"grad_norm": 1.8347725868225098,
"learning_rate": 7.839516238619378e-06,
"loss": 1.3637,
"step": 15900
},
{
"epoch": 1.09,
"grad_norm": 1.9449338912963867,
"learning_rate": 7.825927435792908e-06,
"loss": 1.2906,
"step": 16000
},
{
"epoch": 1.09,
"eval_loss": 1.3169829845428467,
"eval_runtime": 70.9704,
"eval_samples_per_second": 35.832,
"eval_steps_per_second": 4.481,
"step": 16000
},
{
"epoch": 1.09,
"grad_norm": 1.6746830940246582,
"learning_rate": 7.812338632966436e-06,
"loss": 1.3326,
"step": 16100
},
{
"epoch": 1.1,
"grad_norm": 1.5581905841827393,
"learning_rate": 7.798749830139966e-06,
"loss": 1.2964,
"step": 16200
},
{
"epoch": 1.11,
"grad_norm": 2.1636734008789062,
"learning_rate": 7.785161027313495e-06,
"loss": 1.2867,
"step": 16300
},
{
"epoch": 1.11,
"grad_norm": 1.760335922241211,
"learning_rate": 7.771572224487023e-06,
"loss": 1.3017,
"step": 16400
},
{
"epoch": 1.12,
"grad_norm": 1.9209500551223755,
"learning_rate": 7.757983421660553e-06,
"loss": 1.331,
"step": 16500
},
{
"epoch": 1.12,
"eval_loss": 1.3085339069366455,
"eval_runtime": 70.9762,
"eval_samples_per_second": 35.829,
"eval_steps_per_second": 4.48,
"step": 16500
},
{
"epoch": 1.13,
"grad_norm": 1.8936728239059448,
"learning_rate": 7.74439461883408e-06,
"loss": 1.3494,
"step": 16600
},
{
"epoch": 1.13,
"grad_norm": 1.6603670120239258,
"learning_rate": 7.73080581600761e-06,
"loss": 1.3443,
"step": 16700
},
{
"epoch": 1.14,
"grad_norm": 1.9962695837020874,
"learning_rate": 7.71721701318114e-06,
"loss": 1.3016,
"step": 16800
},
{
"epoch": 1.15,
"grad_norm": 1.7902451753616333,
"learning_rate": 7.70362821035467e-06,
"loss": 1.3107,
"step": 16900
},
{
"epoch": 1.16,
"grad_norm": 1.7962889671325684,
"learning_rate": 7.690039407528197e-06,
"loss": 1.3082,
"step": 17000
},
{
"epoch": 1.16,
"eval_loss": 1.3007583618164062,
"eval_runtime": 70.8534,
"eval_samples_per_second": 35.891,
"eval_steps_per_second": 4.488,
"step": 17000
},
{
"epoch": 1.16,
"grad_norm": 1.8448220491409302,
"learning_rate": 7.676450604701725e-06,
"loss": 1.3148,
"step": 17100
},
{
"epoch": 1.17,
"grad_norm": 2.124708652496338,
"learning_rate": 7.662861801875255e-06,
"loss": 1.3348,
"step": 17200
},
{
"epoch": 1.18,
"grad_norm": 1.5953021049499512,
"learning_rate": 7.649272999048785e-06,
"loss": 1.2575,
"step": 17300
},
{
"epoch": 1.18,
"grad_norm": 1.7431753873825073,
"learning_rate": 7.635684196222314e-06,
"loss": 1.3239,
"step": 17400
},
{
"epoch": 1.19,
"grad_norm": 2.0628209114074707,
"learning_rate": 7.622095393395842e-06,
"loss": 1.2904,
"step": 17500
},
{
"epoch": 1.19,
"eval_loss": 1.2971383333206177,
"eval_runtime": 70.8465,
"eval_samples_per_second": 35.894,
"eval_steps_per_second": 4.489,
"step": 17500
},
{
"epoch": 1.2,
"grad_norm": 2.154141902923584,
"learning_rate": 7.608506590569371e-06,
"loss": 1.2824,
"step": 17600
},
{
"epoch": 1.2,
"grad_norm": 1.7325314283370972,
"learning_rate": 7.595053675771166e-06,
"loss": 1.2587,
"step": 17700
},
{
"epoch": 1.21,
"grad_norm": 1.7533044815063477,
"learning_rate": 7.581464872944694e-06,
"loss": 1.2697,
"step": 17800
},
{
"epoch": 1.22,
"grad_norm": 1.642408847808838,
"learning_rate": 7.567876070118223e-06,
"loss": 1.2587,
"step": 17900
},
{
"epoch": 1.22,
"grad_norm": 2.0030946731567383,
"learning_rate": 7.554287267291752e-06,
"loss": 1.2825,
"step": 18000
},
{
"epoch": 1.22,
"eval_loss": 1.2882283926010132,
"eval_runtime": 70.9711,
"eval_samples_per_second": 35.831,
"eval_steps_per_second": 4.481,
"step": 18000
},
{
"epoch": 1.23,
"grad_norm": 1.828447699546814,
"learning_rate": 7.540698464465281e-06,
"loss": 1.2162,
"step": 18100
},
{
"epoch": 1.24,
"grad_norm": 1.9078677892684937,
"learning_rate": 7.527109661638811e-06,
"loss": 1.2618,
"step": 18200
},
{
"epoch": 1.24,
"grad_norm": 1.7438205480575562,
"learning_rate": 7.513520858812339e-06,
"loss": 1.2972,
"step": 18300
},
{
"epoch": 1.25,
"grad_norm": 1.5308886766433716,
"learning_rate": 7.499932055985868e-06,
"loss": 1.2635,
"step": 18400
},
{
"epoch": 1.26,
"grad_norm": 1.7570804357528687,
"learning_rate": 7.486343253159397e-06,
"loss": 1.3104,
"step": 18500
},
{
"epoch": 1.26,
"eval_loss": 1.2821784019470215,
"eval_runtime": 70.9616,
"eval_samples_per_second": 35.836,
"eval_steps_per_second": 4.481,
"step": 18500
},
{
"epoch": 1.26,
"grad_norm": 1.820691466331482,
"learning_rate": 7.472754450332927e-06,
"loss": 1.2978,
"step": 18600
},
{
"epoch": 1.27,
"grad_norm": 2.0996739864349365,
"learning_rate": 7.4591656475064555e-06,
"loss": 1.2925,
"step": 18700
},
{
"epoch": 1.28,
"grad_norm": 1.6602342128753662,
"learning_rate": 7.445576844679985e-06,
"loss": 1.243,
"step": 18800
},
{
"epoch": 1.28,
"grad_norm": 2.034649133682251,
"learning_rate": 7.431988041853513e-06,
"loss": 1.2775,
"step": 18900
},
{
"epoch": 1.29,
"grad_norm": 1.898582100868225,
"learning_rate": 7.418399239027042e-06,
"loss": 1.2786,
"step": 19000
},
{
"epoch": 1.29,
"eval_loss": 1.2745426893234253,
"eval_runtime": 70.9759,
"eval_samples_per_second": 35.829,
"eval_steps_per_second": 4.48,
"step": 19000
},
{
"epoch": 1.3,
"grad_norm": 1.4128785133361816,
"learning_rate": 7.4048104362005715e-06,
"loss": 1.2656,
"step": 19100
},
{
"epoch": 1.3,
"grad_norm": 2.2971534729003906,
"learning_rate": 7.3912216333741e-06,
"loss": 1.2426,
"step": 19200
},
{
"epoch": 1.31,
"grad_norm": 1.783996820449829,
"learning_rate": 7.37763283054763e-06,
"loss": 1.272,
"step": 19300
},
{
"epoch": 1.32,
"grad_norm": 1.8958848714828491,
"learning_rate": 7.364044027721159e-06,
"loss": 1.2168,
"step": 19400
},
{
"epoch": 1.32,
"grad_norm": 2.1363043785095215,
"learning_rate": 7.350455224894687e-06,
"loss": 1.2734,
"step": 19500
},
{
"epoch": 1.32,
"eval_loss": 1.2699941396713257,
"eval_runtime": 71.0334,
"eval_samples_per_second": 35.8,
"eval_steps_per_second": 4.477,
"step": 19500
},
{
"epoch": 1.33,
"grad_norm": 2.642695903778076,
"learning_rate": 7.336866422068216e-06,
"loss": 1.3135,
"step": 19600
},
{
"epoch": 1.34,
"grad_norm": 1.8240422010421753,
"learning_rate": 7.323277619241746e-06,
"loss": 1.2924,
"step": 19700
},
{
"epoch": 1.35,
"grad_norm": 1.8623692989349365,
"learning_rate": 7.309688816415275e-06,
"loss": 1.2907,
"step": 19800
},
{
"epoch": 1.35,
"grad_norm": 2.2778708934783936,
"learning_rate": 7.296100013588804e-06,
"loss": 1.2727,
"step": 19900
},
{
"epoch": 1.36,
"grad_norm": 1.8957061767578125,
"learning_rate": 7.282511210762332e-06,
"loss": 1.2656,
"step": 20000
},
{
"epoch": 1.36,
"eval_loss": 1.2644336223602295,
"eval_runtime": 70.9373,
"eval_samples_per_second": 35.849,
"eval_steps_per_second": 4.483,
"step": 20000
},
{
"epoch": 1.37,
"grad_norm": 1.7855497598648071,
"learning_rate": 7.268922407935861e-06,
"loss": 1.2158,
"step": 20100
},
{
"epoch": 1.37,
"grad_norm": 1.8924943208694458,
"learning_rate": 7.2553336051093905e-06,
"loss": 1.2753,
"step": 20200
},
{
"epoch": 1.38,
"grad_norm": 2.0177762508392334,
"learning_rate": 7.241880690311184e-06,
"loss": 1.2406,
"step": 20300
},
{
"epoch": 1.39,
"grad_norm": 2.4161458015441895,
"learning_rate": 7.228291887484713e-06,
"loss": 1.2453,
"step": 20400
},
{
"epoch": 1.39,
"grad_norm": 1.7210235595703125,
"learning_rate": 7.214703084658242e-06,
"loss": 1.2107,
"step": 20500
},
{
"epoch": 1.39,
"eval_loss": 1.258453607559204,
"eval_runtime": 71.0477,
"eval_samples_per_second": 35.793,
"eval_steps_per_second": 4.476,
"step": 20500
},
{
"epoch": 1.4,
"grad_norm": 1.5457173585891724,
"learning_rate": 7.201114281831771e-06,
"loss": 1.2377,
"step": 20600
},
{
"epoch": 1.41,
"grad_norm": 2.403831720352173,
"learning_rate": 7.187525479005301e-06,
"loss": 1.238,
"step": 20700
},
{
"epoch": 1.41,
"grad_norm": 2.01042103767395,
"learning_rate": 7.173936676178829e-06,
"loss": 1.2528,
"step": 20800
},
{
"epoch": 1.42,
"grad_norm": 2.197006940841675,
"learning_rate": 7.160347873352358e-06,
"loss": 1.2395,
"step": 20900
},
{
"epoch": 1.43,
"grad_norm": 2.503634214401245,
"learning_rate": 7.146759070525887e-06,
"loss": 1.2822,
"step": 21000
},
{
"epoch": 1.43,
"eval_loss": 1.2508896589279175,
"eval_runtime": 71.0512,
"eval_samples_per_second": 35.791,
"eval_steps_per_second": 4.476,
"step": 21000
},
{
"epoch": 1.43,
"grad_norm": 1.4275486469268799,
"learning_rate": 7.133170267699417e-06,
"loss": 1.2337,
"step": 21100
},
{
"epoch": 1.44,
"grad_norm": 2.1461949348449707,
"learning_rate": 7.11971735290121e-06,
"loss": 1.2576,
"step": 21200
},
{
"epoch": 1.45,
"grad_norm": 1.705665111541748,
"learning_rate": 7.106128550074739e-06,
"loss": 1.2311,
"step": 21300
},
{
"epoch": 1.45,
"grad_norm": 2.058223247528076,
"learning_rate": 7.0925397472482685e-06,
"loss": 1.2619,
"step": 21400
},
{
"epoch": 1.46,
"grad_norm": 1.4659618139266968,
"learning_rate": 7.078950944421797e-06,
"loss": 1.2188,
"step": 21500
},
{
"epoch": 1.46,
"eval_loss": 1.2478315830230713,
"eval_runtime": 70.8088,
"eval_samples_per_second": 35.914,
"eval_steps_per_second": 4.491,
"step": 21500
},
{
"epoch": 1.47,
"grad_norm": 1.7102398872375488,
"learning_rate": 7.065362141595325e-06,
"loss": 1.2553,
"step": 21600
},
{
"epoch": 1.47,
"grad_norm": 2.445326089859009,
"learning_rate": 7.051773338768855e-06,
"loss": 1.2687,
"step": 21700
},
{
"epoch": 1.48,
"grad_norm": 2.055088758468628,
"learning_rate": 7.038184535942384e-06,
"loss": 1.254,
"step": 21800
},
{
"epoch": 1.49,
"grad_norm": 2.5781538486480713,
"learning_rate": 7.024595733115913e-06,
"loss": 1.2319,
"step": 21900
},
{
"epoch": 1.49,
"grad_norm": 1.9507685899734497,
"learning_rate": 7.011006930289442e-06,
"loss": 1.2185,
"step": 22000
},
{
"epoch": 1.49,
"eval_loss": 1.245086908340454,
"eval_runtime": 71.042,
"eval_samples_per_second": 35.796,
"eval_steps_per_second": 4.476,
"step": 22000
},
{
"epoch": 1.5,
"grad_norm": 2.347245216369629,
"learning_rate": 6.997418127462972e-06,
"loss": 1.2024,
"step": 22100
},
{
"epoch": 1.51,
"grad_norm": 1.6178568601608276,
"learning_rate": 6.9838293246364995e-06,
"loss": 1.2379,
"step": 22200
},
{
"epoch": 1.52,
"grad_norm": 1.7551498413085938,
"learning_rate": 6.970240521810029e-06,
"loss": 1.2344,
"step": 22300
},
{
"epoch": 1.52,
"grad_norm": 1.9250737428665161,
"learning_rate": 6.956651718983558e-06,
"loss": 1.2942,
"step": 22400
},
{
"epoch": 1.53,
"grad_norm": 1.640271782875061,
"learning_rate": 6.9430629161570876e-06,
"loss": 1.2441,
"step": 22500
},
{
"epoch": 1.53,
"eval_loss": 1.2352341413497925,
"eval_runtime": 70.9437,
"eval_samples_per_second": 35.845,
"eval_steps_per_second": 4.482,
"step": 22500
},
{
"epoch": 1.54,
"grad_norm": 2.083061456680298,
"learning_rate": 6.929474113330616e-06,
"loss": 1.1819,
"step": 22600
},
{
"epoch": 1.54,
"grad_norm": 1.8274168968200684,
"learning_rate": 6.915885310504146e-06,
"loss": 1.2243,
"step": 22700
},
{
"epoch": 1.55,
"grad_norm": 1.7711529731750488,
"learning_rate": 6.902296507677674e-06,
"loss": 1.2318,
"step": 22800
},
{
"epoch": 1.56,
"grad_norm": 2.0537028312683105,
"learning_rate": 6.888707704851203e-06,
"loss": 1.1958,
"step": 22900
},
{
"epoch": 1.56,
"grad_norm": 1.8466728925704956,
"learning_rate": 6.875118902024732e-06,
"loss": 1.2564,
"step": 23000
},
{
"epoch": 1.56,
"eval_loss": 1.2326687574386597,
"eval_runtime": 70.9545,
"eval_samples_per_second": 35.84,
"eval_steps_per_second": 4.482,
"step": 23000
},
{
"epoch": 1.57,
"grad_norm": 1.6301406621932983,
"learning_rate": 6.861530099198261e-06,
"loss": 1.1904,
"step": 23100
},
{
"epoch": 1.58,
"grad_norm": 2.1303887367248535,
"learning_rate": 6.847941296371791e-06,
"loss": 1.2,
"step": 23200
},
{
"epoch": 1.58,
"grad_norm": 2.042210340499878,
"learning_rate": 6.834352493545319e-06,
"loss": 1.2432,
"step": 23300
},
{
"epoch": 1.59,
"grad_norm": 1.8403574228286743,
"learning_rate": 6.820763690718848e-06,
"loss": 1.2195,
"step": 23400
},
{
"epoch": 1.6,
"grad_norm": 1.8628817796707153,
"learning_rate": 6.807174887892377e-06,
"loss": 1.2032,
"step": 23500
},
{
"epoch": 1.6,
"eval_loss": 1.2271267175674438,
"eval_runtime": 70.9472,
"eval_samples_per_second": 35.844,
"eval_steps_per_second": 4.482,
"step": 23500
},
{
"epoch": 1.6,
"grad_norm": 2.2309892177581787,
"learning_rate": 6.793586085065907e-06,
"loss": 1.1931,
"step": 23600
},
{
"epoch": 1.61,
"grad_norm": 1.4337612390518188,
"learning_rate": 6.7799972822394354e-06,
"loss": 1.2401,
"step": 23700
},
{
"epoch": 1.62,
"grad_norm": 1.7968145608901978,
"learning_rate": 6.766408479412965e-06,
"loss": 1.233,
"step": 23800
},
{
"epoch": 1.62,
"grad_norm": 1.7918980121612549,
"learning_rate": 6.7529555646147584e-06,
"loss": 1.1874,
"step": 23900
},
{
"epoch": 1.63,
"grad_norm": 1.9370090961456299,
"learning_rate": 6.739366761788287e-06,
"loss": 1.2031,
"step": 24000
},
{
"epoch": 1.63,
"eval_loss": 1.2228479385375977,
"eval_runtime": 70.9391,
"eval_samples_per_second": 35.848,
"eval_steps_per_second": 4.483,
"step": 24000
},
{
"epoch": 1.64,
"grad_norm": 2.238128900527954,
"learning_rate": 6.725777958961815e-06,
"loss": 1.2207,
"step": 24100
},
{
"epoch": 1.64,
"grad_norm": 1.9183790683746338,
"learning_rate": 6.712189156135345e-06,
"loss": 1.2263,
"step": 24200
},
{
"epoch": 1.65,
"grad_norm": 2.407428026199341,
"learning_rate": 6.6986003533088736e-06,
"loss": 1.2424,
"step": 24300
},
{
"epoch": 1.66,
"grad_norm": 1.837365746498108,
"learning_rate": 6.685011550482403e-06,
"loss": 1.1832,
"step": 24400
},
{
"epoch": 1.66,
"grad_norm": 1.9926724433898926,
"learning_rate": 6.671422747655932e-06,
"loss": 1.2088,
"step": 24500
},
{
"epoch": 1.66,
"eval_loss": 1.2178888320922852,
"eval_runtime": 71.0012,
"eval_samples_per_second": 35.816,
"eval_steps_per_second": 4.479,
"step": 24500
},
{
"epoch": 1.67,
"grad_norm": 1.6491261720657349,
"learning_rate": 6.657833944829462e-06,
"loss": 1.2103,
"step": 24600
},
{
"epoch": 1.68,
"grad_norm": 1.825826644897461,
"learning_rate": 6.6442451420029895e-06,
"loss": 1.1911,
"step": 24700
},
{
"epoch": 1.69,
"grad_norm": 1.788091778755188,
"learning_rate": 6.630656339176519e-06,
"loss": 1.2246,
"step": 24800
},
{
"epoch": 1.69,
"grad_norm": 1.8941233158111572,
"learning_rate": 6.617067536350048e-06,
"loss": 1.197,
"step": 24900
},
{
"epoch": 1.7,
"grad_norm": 2.360272169113159,
"learning_rate": 6.6034787335235775e-06,
"loss": 1.1925,
"step": 25000
},
{
"epoch": 1.7,
"eval_loss": 1.2120610475540161,
"eval_runtime": 70.9625,
"eval_samples_per_second": 35.836,
"eval_steps_per_second": 4.481,
"step": 25000
},
{
"epoch": 1.71,
"grad_norm": 2.0026679039001465,
"learning_rate": 6.589889930697106e-06,
"loss": 1.2063,
"step": 25100
},
{
"epoch": 1.71,
"grad_norm": 1.9979290962219238,
"learning_rate": 6.576301127870636e-06,
"loss": 1.1784,
"step": 25200
},
{
"epoch": 1.72,
"grad_norm": 1.682900071144104,
"learning_rate": 6.562712325044164e-06,
"loss": 1.1587,
"step": 25300
},
{
"epoch": 1.73,
"grad_norm": 2.0586678981781006,
"learning_rate": 6.549123522217693e-06,
"loss": 1.2031,
"step": 25400
},
{
"epoch": 1.73,
"grad_norm": 2.5424463748931885,
"learning_rate": 6.535534719391222e-06,
"loss": 1.2061,
"step": 25500
},
{
"epoch": 1.73,
"eval_loss": 1.209425449371338,
"eval_runtime": 70.8814,
"eval_samples_per_second": 35.877,
"eval_steps_per_second": 4.486,
"step": 25500
},
{
"epoch": 1.74,
"grad_norm": 2.0070347785949707,
"learning_rate": 6.521945916564751e-06,
"loss": 1.2175,
"step": 25600
},
{
"epoch": 1.75,
"grad_norm": 1.7913732528686523,
"learning_rate": 6.508357113738281e-06,
"loss": 1.2005,
"step": 25700
},
{
"epoch": 1.75,
"grad_norm": 1.552306890487671,
"learning_rate": 6.494768310911809e-06,
"loss": 1.2011,
"step": 25800
},
{
"epoch": 1.76,
"grad_norm": 3.1545894145965576,
"learning_rate": 6.481179508085338e-06,
"loss": 1.17,
"step": 25900
},
{
"epoch": 1.77,
"grad_norm": 1.7653687000274658,
"learning_rate": 6.467590705258867e-06,
"loss": 1.1984,
"step": 26000
},
{
"epoch": 1.77,
"eval_loss": 1.2038514614105225,
"eval_runtime": 70.8478,
"eval_samples_per_second": 35.894,
"eval_steps_per_second": 4.488,
"step": 26000
},
{
"epoch": 1.77,
"grad_norm": 1.9187010526657104,
"learning_rate": 6.454001902432397e-06,
"loss": 1.2051,
"step": 26100
},
{
"epoch": 1.78,
"grad_norm": 1.6188615560531616,
"learning_rate": 6.440413099605925e-06,
"loss": 1.1712,
"step": 26200
},
{
"epoch": 1.79,
"grad_norm": 1.9360331296920776,
"learning_rate": 6.426824296779455e-06,
"loss": 1.1531,
"step": 26300
},
{
"epoch": 1.79,
"grad_norm": 2.710357189178467,
"learning_rate": 6.413235493952983e-06,
"loss": 1.1707,
"step": 26400
},
{
"epoch": 1.8,
"grad_norm": 2.3331565856933594,
"learning_rate": 6.399646691126512e-06,
"loss": 1.1929,
"step": 26500
},
{
"epoch": 1.8,
"eval_loss": 1.2011253833770752,
"eval_runtime": 70.8706,
"eval_samples_per_second": 35.882,
"eval_steps_per_second": 4.487,
"step": 26500
},
{
"epoch": 1.81,
"grad_norm": 2.030912399291992,
"learning_rate": 6.386057888300041e-06,
"loss": 1.1986,
"step": 26600
},
{
"epoch": 1.81,
"grad_norm": 2.1584174633026123,
"learning_rate": 6.37246908547357e-06,
"loss": 1.2564,
"step": 26700
},
{
"epoch": 1.82,
"grad_norm": 2.3068361282348633,
"learning_rate": 6.3588802826471e-06,
"loss": 1.1933,
"step": 26800
},
{
"epoch": 1.83,
"grad_norm": 1.5643947124481201,
"learning_rate": 6.3452914798206285e-06,
"loss": 1.1691,
"step": 26900
},
{
"epoch": 1.83,
"grad_norm": 2.531083822250366,
"learning_rate": 6.331702676994157e-06,
"loss": 1.1387,
"step": 27000
},
{
"epoch": 1.83,
"eval_loss": 1.1969281435012817,
"eval_runtime": 70.8053,
"eval_samples_per_second": 35.915,
"eval_steps_per_second": 4.491,
"step": 27000
},
{
"epoch": 1.84,
"grad_norm": 1.800430417060852,
"learning_rate": 6.318113874167686e-06,
"loss": 1.1646,
"step": 27100
},
{
"epoch": 1.85,
"grad_norm": 1.8300161361694336,
"learning_rate": 6.304525071341216e-06,
"loss": 1.2104,
"step": 27200
},
{
"epoch": 1.85,
"grad_norm": 1.7128742933273315,
"learning_rate": 6.2909362685147445e-06,
"loss": 1.1932,
"step": 27300
},
{
"epoch": 1.86,
"grad_norm": 1.9414857625961304,
"learning_rate": 6.277347465688274e-06,
"loss": 1.1818,
"step": 27400
},
{
"epoch": 1.87,
"grad_norm": 1.9951707124710083,
"learning_rate": 6.263758662861802e-06,
"loss": 1.2024,
"step": 27500
},
{
"epoch": 1.87,
"eval_loss": 1.1932079792022705,
"eval_runtime": 70.8009,
"eval_samples_per_second": 35.918,
"eval_steps_per_second": 4.491,
"step": 27500
},
{
"epoch": 1.88,
"grad_norm": 1.9903321266174316,
"learning_rate": 6.250169860035331e-06,
"loss": 1.1824,
"step": 27600
},
{
"epoch": 1.88,
"grad_norm": 2.8851983547210693,
"learning_rate": 6.2365810572088605e-06,
"loss": 1.1851,
"step": 27700
},
{
"epoch": 1.89,
"grad_norm": 2.0669078826904297,
"learning_rate": 6.222992254382389e-06,
"loss": 1.1646,
"step": 27800
},
{
"epoch": 1.9,
"grad_norm": 1.9089607000350952,
"learning_rate": 6.209403451555919e-06,
"loss": 1.1776,
"step": 27900
},
{
"epoch": 1.9,
"grad_norm": 2.2551538944244385,
"learning_rate": 6.195814648729448e-06,
"loss": 1.1909,
"step": 28000
},
{
"epoch": 1.9,
"eval_loss": 1.1877614259719849,
"eval_runtime": 70.8691,
"eval_samples_per_second": 35.883,
"eval_steps_per_second": 4.487,
"step": 28000
},
{
"epoch": 1.91,
"grad_norm": 1.5612294673919678,
"learning_rate": 6.182225845902976e-06,
"loss": 1.1559,
"step": 28100
},
{
"epoch": 1.92,
"grad_norm": 2.8218579292297363,
"learning_rate": 6.168637043076505e-06,
"loss": 1.1652,
"step": 28200
},
{
"epoch": 1.92,
"grad_norm": 1.9702138900756836,
"learning_rate": 6.155048240250035e-06,
"loss": 1.1917,
"step": 28300
},
{
"epoch": 1.93,
"grad_norm": 2.3673105239868164,
"learning_rate": 6.141459437423564e-06,
"loss": 1.1336,
"step": 28400
},
{
"epoch": 1.94,
"grad_norm": 1.8467798233032227,
"learning_rate": 6.127870634597093e-06,
"loss": 1.1786,
"step": 28500
},
{
"epoch": 1.94,
"eval_loss": 1.1837141513824463,
"eval_runtime": 70.7666,
"eval_samples_per_second": 35.935,
"eval_steps_per_second": 4.494,
"step": 28500
},
{
"epoch": 1.94,
"grad_norm": 1.830837368965149,
"learning_rate": 6.114281831770621e-06,
"loss": 1.1908,
"step": 28600
},
{
"epoch": 1.95,
"grad_norm": 1.7194899320602417,
"learning_rate": 6.10069302894415e-06,
"loss": 1.1541,
"step": 28700
},
{
"epoch": 1.96,
"grad_norm": 1.798368215560913,
"learning_rate": 6.0871042261176796e-06,
"loss": 1.1487,
"step": 28800
},
{
"epoch": 1.96,
"grad_norm": 1.9699339866638184,
"learning_rate": 6.073515423291208e-06,
"loss": 1.166,
"step": 28900
},
{
"epoch": 1.97,
"grad_norm": 2.1379947662353516,
"learning_rate": 6.059926620464738e-06,
"loss": 1.1724,
"step": 29000
},
{
"epoch": 1.97,
"eval_loss": 1.181123971939087,
"eval_runtime": 92.5562,
"eval_samples_per_second": 27.475,
"eval_steps_per_second": 3.436,
"step": 29000
},
{
"epoch": 1.98,
"grad_norm": 2.405134439468384,
"learning_rate": 6.046337817638267e-06,
"loss": 1.1651,
"step": 29100
},
{
"epoch": 1.98,
"grad_norm": 1.8744902610778809,
"learning_rate": 6.032749014811795e-06,
"loss": 1.201,
"step": 29200
},
{
"epoch": 1.99,
"grad_norm": 3.014401435852051,
"learning_rate": 6.019160211985324e-06,
"loss": 1.1783,
"step": 29300
},
{
"epoch": 2.0,
"grad_norm": 2.104191780090332,
"learning_rate": 6.005571409158854e-06,
"loss": 1.1479,
"step": 29400
},
{
"epoch": 2.0,
"grad_norm": 1.9670746326446533,
"learning_rate": 5.991982606332383e-06,
"loss": 1.1372,
"step": 29500
},
{
"epoch": 2.0,
"eval_loss": 1.176620602607727,
"eval_runtime": 92.464,
"eval_samples_per_second": 27.503,
"eval_steps_per_second": 3.439,
"step": 29500
},
{
"epoch": 2.01,
"grad_norm": 2.3720388412475586,
"learning_rate": 5.978393803505912e-06,
"loss": 1.1476,
"step": 29600
},
{
"epoch": 2.02,
"grad_norm": 2.047060251235962,
"learning_rate": 5.964805000679441e-06,
"loss": 1.1303,
"step": 29700
},
{
"epoch": 2.02,
"grad_norm": 1.9792667627334595,
"learning_rate": 5.951216197852969e-06,
"loss": 1.1462,
"step": 29800
},
{
"epoch": 2.03,
"grad_norm": 2.241187572479248,
"learning_rate": 5.937627395026499e-06,
"loss": 1.1031,
"step": 29900
},
{
"epoch": 2.04,
"grad_norm": 2.0233969688415527,
"learning_rate": 5.9240385922000274e-06,
"loss": 1.1396,
"step": 30000
},
{
"epoch": 2.04,
"eval_loss": 1.1728562116622925,
"eval_runtime": 92.5707,
"eval_samples_per_second": 27.471,
"eval_steps_per_second": 3.435,
"step": 30000
},
{
"epoch": 2.05,
"grad_norm": 2.0683882236480713,
"learning_rate": 5.910449789373557e-06,
"loss": 1.1431,
"step": 30100
},
{
"epoch": 2.05,
"grad_norm": 1.9208968877792358,
"learning_rate": 5.896860986547086e-06,
"loss": 1.1391,
"step": 30200
},
{
"epoch": 2.06,
"grad_norm": 1.6621592044830322,
"learning_rate": 5.883272183720614e-06,
"loss": 1.1361,
"step": 30300
},
{
"epoch": 2.07,
"grad_norm": 1.9728045463562012,
"learning_rate": 5.869683380894143e-06,
"loss": 1.1606,
"step": 30400
},
{
"epoch": 2.07,
"grad_norm": 2.3189892768859863,
"learning_rate": 5.856094578067672e-06,
"loss": 1.1565,
"step": 30500
},
{
"epoch": 2.07,
"eval_loss": 1.1692627668380737,
"eval_runtime": 92.5367,
"eval_samples_per_second": 27.481,
"eval_steps_per_second": 3.436,
"step": 30500
},
{
"epoch": 2.08,
"grad_norm": 1.936689853668213,
"learning_rate": 5.842505775241202e-06,
"loss": 1.1294,
"step": 30600
},
{
"epoch": 2.09,
"grad_norm": 1.4617129564285278,
"learning_rate": 5.8289169724147306e-06,
"loss": 1.1591,
"step": 30700
},
{
"epoch": 2.09,
"grad_norm": 1.5474071502685547,
"learning_rate": 5.81532816958826e-06,
"loss": 1.1377,
"step": 30800
},
{
"epoch": 2.1,
"grad_norm": 1.7175779342651367,
"learning_rate": 5.801739366761788e-06,
"loss": 1.1532,
"step": 30900
},
{
"epoch": 2.11,
"grad_norm": 1.8924795389175415,
"learning_rate": 5.788150563935318e-06,
"loss": 1.1002,
"step": 31000
},
{
"epoch": 2.11,
"eval_loss": 1.1667861938476562,
"eval_runtime": 92.475,
"eval_samples_per_second": 27.499,
"eval_steps_per_second": 3.439,
"step": 31000
},
{
"epoch": 2.11,
"grad_norm": 2.3616528511047363,
"learning_rate": 5.7745617611088465e-06,
"loss": 1.1616,
"step": 31100
},
{
"epoch": 2.12,
"grad_norm": 1.7967276573181152,
"learning_rate": 5.760972958282376e-06,
"loss": 1.1817,
"step": 31200
},
{
"epoch": 2.13,
"grad_norm": 2.9053776264190674,
"learning_rate": 5.747384155455905e-06,
"loss": 1.1611,
"step": 31300
},
{
"epoch": 2.13,
"grad_norm": 2.2042810916900635,
"learning_rate": 5.7337953526294346e-06,
"loss": 1.1394,
"step": 31400
},
{
"epoch": 2.14,
"grad_norm": 1.8876034021377563,
"learning_rate": 5.7202065498029625e-06,
"loss": 1.1171,
"step": 31500
},
{
"epoch": 2.14,
"eval_loss": 1.1626156568527222,
"eval_runtime": 92.4663,
"eval_samples_per_second": 27.502,
"eval_steps_per_second": 3.439,
"step": 31500
}
],
"logging_steps": 100,
"max_steps": 73590,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.55848975386624e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
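
The file above follows the usual trainer_state.json layout written by the Hugging Face Trainer: a log_history list that mixes training entries (loss, learning_rate, grad_norm) with evaluation entries (eval_loss and, for the first few evaluations, CodeBLEU component scores), plus run-level fields such as logging_steps and eval_steps. Below is a minimal sketch, using only the standard library, of how one might pull the loss curves out of this log; the local path "trainer_state.json" is an assumption for the example, not part of the original file.

import json

# Load the trainer state; the path is illustrative and assumes the file was downloaded locally.
with open("trainer_state.json") as f:
    state = json.load(f)

train_points = []  # (step, loss) from training log entries
eval_points = []   # (step, eval_loss) from evaluation entries

for entry in state["log_history"]:
    if "loss" in entry:
        train_points.append((entry["step"], entry["loss"]))
    if "eval_loss" in entry:
        eval_points.append((entry["step"], entry["eval_loss"]))

# Training loss is logged every logging_steps steps, evaluation every eval_steps steps.
print(f"logged every {state['logging_steps']} steps, evaluated every {state['eval_steps']} steps")
print("last train loss:", train_points[-1])
print("last eval loss :", eval_points[-1])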